#![allow(clippy::cast_precision_loss)]
use std::fmt::Write;
use std::time::Duration;
use serde::{Deserialize, Serialize};
#[cfg(feature = "bench-http")]
use crate::http_client::{CompletionRequest, ModelHttpClient, OllamaOptions, OllamaRequest};
/// Adaptive sample-count controller: keeps collecting benchmark samples
/// until the coefficient of variation (CV) over a trailing window stays
/// below a threshold for several consecutive checks.
#[derive(Debug, Clone)]
pub struct DynamicSampler {
    /// Lower bound on samples before stopping is even considered.
    pub min_samples: usize,
    /// Hard upper bound on samples.
    pub max_samples: usize,
    /// CV strictly below this value counts as a "stable" check.
    pub cv_threshold: f64,
    /// Number of trailing samples used for each CV computation.
    pub cv_window: usize,
    /// Consecutive stable checks required before stopping.
    pub stability_count: usize,
    // Running count of consecutive stable checks.
    stable_streak: usize,
}

impl Default for DynamicSampler {
    fn default() -> Self {
        DynamicSampler {
            min_samples: 100,
            max_samples: 10_000,
            cv_threshold: 0.05,
            cv_window: 50,
            stability_count: 3,
            stable_streak: 0,
        }
    }
}

impl DynamicSampler {
    /// Builds a sampler with custom bounds/threshold and the default
    /// window (50) and stability count (3).
    #[must_use]
    pub fn new(min_samples: usize, max_samples: usize, cv_threshold: f64) -> Self {
        Self {
            min_samples,
            max_samples,
            cv_threshold,
            cv_window: 50,
            stability_count: 3,
            stable_streak: 0,
        }
    }

    /// Returns `true` while more samples should be collected.
    ///
    /// Always continues below `min_samples`, always stops at
    /// `max_samples`, and otherwise stops once the windowed CV has been
    /// under `cv_threshold` for `stability_count` consecutive calls.
    #[must_use]
    pub fn should_continue(&mut self, samples: &[f64]) -> bool {
        let count = samples.len();
        if count < self.min_samples {
            return true;
        }
        if count >= self.max_samples {
            return false;
        }
        let start = count.saturating_sub(self.cv_window);
        if compute_cv(&samples[start..]) < self.cv_threshold {
            self.stable_streak += 1;
            if self.stable_streak >= self.stability_count {
                return false;
            }
        } else {
            // Any unstable window resets the streak.
            self.stable_streak = 0;
        }
        true
    }

    /// CV over the trailing window; infinity with fewer than two samples.
    #[must_use]
    pub fn current_cv(&self, samples: &[f64]) -> f64 {
        if samples.len() < 2 {
            return f64::INFINITY;
        }
        let start = samples.len().saturating_sub(self.cv_window);
        compute_cv(&samples[start..])
    }

    /// Clears the stability streak (e.g. between benchmark runs).
    pub fn reset(&mut self) {
        self.stable_streak = 0;
    }
}

/// Coefficient of variation (sample std-dev / |mean|) of `data`.
/// Returns infinity for fewer than two points or a near-zero mean.
fn compute_cv(data: &[f64]) -> f64 {
    if data.len() < 2 {
        return f64::INFINITY;
    }
    let n = data.len() as f64;
    let mean = data.iter().sum::<f64>() / n;
    if mean.abs() < 1e-10 {
        return f64::INFINITY;
    }
    let var = data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0);
    var.sqrt() / mean.abs()
}
/// Protects benchmark integrity from thermal effects: forces cooldown
/// sleeps above a temperature ceiling and rejects runs whose temperature
/// trace varied too much.
#[derive(Debug, Clone)]
pub struct ThermalGuard {
    /// Temperature above which a cooldown sleep is triggered.
    pub max_temp_c: f64,
    /// Cooldown target temperature (informational in this module).
    pub cooldown_threshold_c: f64,
    /// Duration of one cooldown sleep, milliseconds.
    pub cooldown_sleep_ms: u64,
    /// Maximum allowed std-dev of a run's temperature trace, °C.
    pub temp_variance_c: f64,
}

impl Default for ThermalGuard {
    fn default() -> Self {
        ThermalGuard {
            max_temp_c: 80.0,
            cooldown_threshold_c: 70.0,
            cooldown_sleep_ms: 10_000,
            temp_variance_c: 2.0,
        }
    }
}

/// Verdict on whether a run's thermal trace invalidates its results.
#[derive(Debug, Clone, PartialEq)]
pub enum ThermalValidity {
    Valid,
    /// Carries a human-readable rejection reason.
    Invalid(String),
}

impl ThermalGuard {
    /// Constructs a guard with explicit thresholds.
    #[must_use]
    pub fn new(
        max_temp_c: f64,
        cooldown_threshold_c: f64,
        cooldown_sleep_ms: u64,
        temp_variance_c: f64,
    ) -> Self {
        Self {
            max_temp_c,
            cooldown_threshold_c,
            cooldown_sleep_ms,
            temp_variance_c,
        }
    }

    /// True when `current_temp` is strictly above the ceiling.
    #[must_use]
    pub fn needs_cooldown(&self, current_temp: f64) -> bool {
        current_temp > self.max_temp_c
    }

    /// Checks that the trace's std-dev stayed within `temp_variance_c`.
    /// An empty trace is treated as valid (nothing to judge).
    #[must_use]
    pub fn validate_run(&self, temps: &[f64]) -> ThermalValidity {
        if temps.is_empty() {
            return ThermalValidity::Valid;
        }
        let std_dev = compute_variance(temps).sqrt();
        if std_dev > self.temp_variance_c {
            ThermalValidity::Invalid(format!(
                "Temperature variance {std_dev:.2}°C exceeds threshold {:.2}°C",
                self.temp_variance_c
            ))
        } else {
            ThermalValidity::Valid
        }
    }

    /// Sleeps for `cooldown_sleep_ms` when the ceiling is exceeded.
    pub fn cooldown_if_needed(&self, current_temp: f64) {
        if self.needs_cooldown(current_temp) {
            std::thread::sleep(Duration::from_millis(self.cooldown_sleep_ms));
        }
    }

    /// Highest temperature in `temps`; 0.0 for an empty trace.
    #[must_use]
    pub fn max_temp(&self, temps: &[f64]) -> f64 {
        if temps.is_empty() {
            0.0
        } else {
            temps.iter().copied().fold(f64::NEG_INFINITY, f64::max)
        }
    }

    /// Standard deviation of the temperature trace.
    #[must_use]
    pub fn temp_variance(&self, temps: &[f64]) -> f64 {
        compute_variance(temps).sqrt()
    }
}

/// Sample variance (n − 1 denominator) of `data`; 0.0 below two points.
fn compute_variance(data: &[f64]) -> f64 {
    if data.len() < 2 {
        return 0.0;
    }
    let n = data.len() as f64;
    let mean = data.iter().sum::<f64>() / n;
    data.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0)
}
/// KV-cache allocation efficiency snapshot.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct KvCacheMetrics {
    pub allocated_bytes: u64,
    pub used_bytes: u64,
    /// Percentage of the allocation that went unused (see `new`).
    pub fragmentation_pct: f64,
}
impl KvCacheMetrics {
    /// Builds metrics from raw byte counts; fragmentation is the unused
    /// fraction of the allocation as a percentage (0 when nothing was
    /// allocated).
    #[must_use]
    pub fn new(allocated_bytes: u64, used_bytes: u64) -> Self {
        let fragmentation_pct = match allocated_bytes {
            0 => 0.0,
            total => {
                let waste = total.saturating_sub(used_bytes);
                (waste as f64 / total as f64) * 100.0
            }
        };
        Self {
            allocated_bytes,
            used_bytes,
            fragmentation_pct,
        }
    }

    /// Allocated size in mebibytes.
    #[must_use]
    pub fn allocated_mb(&self) -> f64 {
        self.allocated_bytes as f64 / (1024.0 * 1024.0)
    }

    /// Used size in mebibytes.
    #[must_use]
    pub fn used_mb(&self) -> f64 {
        self.used_bytes as f64 / (1024.0 * 1024.0)
    }

    /// True when fragmentation is strictly below `threshold_pct`.
    #[must_use]
    pub fn is_acceptable(&self, threshold_pct: f64) -> bool {
        self.fragmentation_pct < threshold_pct
    }
}
/// Energy consumption measured over a benchmark run.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EnergyMetrics {
    pub total_joules: f64,
    pub idle_watts: f64,
    pub active_watts_avg: f64,
    pub tokens_generated: u64,
}
impl EnergyMetrics {
    /// Bundles raw energy readings with the token count they cover.
    #[must_use]
    pub fn new(total_joules: f64, idle_watts: f64, active_watts_avg: f64, tokens: u64) -> Self {
        Self {
            total_joules,
            idle_watts,
            active_watts_avg,
            tokens_generated: tokens,
        }
    }

    /// Energy cost per generated token; 0.0 when nothing was generated.
    #[must_use]
    pub fn joules_per_token(&self) -> f64 {
        match self.tokens_generated {
            0 => 0.0,
            tokens => self.total_joules / tokens as f64,
        }
    }

    /// Tokens produced per joule; 0.0 when measured energy is ~zero.
    #[must_use]
    pub fn tokens_per_joule(&self) -> f64 {
        if self.total_joules < 1e-10 {
            0.0
        } else {
            self.tokens_generated as f64 / self.total_joules
        }
    }
}
/// Inter-token latency distribution statistics.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ItlMetrics {
    pub median_ms: f64,
    /// Standard deviation — the jitter measure used by `is_low_jitter`.
    pub std_dev_ms: f64,
    pub p99_ms: f64,
    pub p999_ms: f64,
}
impl ItlMetrics {
    /// Computes median, jitter, and tail percentiles from raw inter-token
    /// latency measurements; all-zero defaults for an empty slice.
    #[must_use]
    pub fn from_measurements(itl_times_ms: &[f64]) -> Self {
        if itl_times_ms.is_empty() {
            return Self::default();
        }
        let mut ordered = itl_times_ms.to_vec();
        ordered.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let n = ordered.len();
        // Midpoint average for even counts, middle element for odd.
        let median_ms = if n % 2 == 0 {
            (ordered[n / 2 - 1] + ordered[n / 2]) / 2.0
        } else {
            ordered[n / 2]
        };
        let mean = itl_times_ms.iter().sum::<f64>() / n as f64;
        // Sample variance; the .max(1.0) keeps a single sample from
        // dividing by zero.
        let variance = itl_times_ms.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
            / (n as f64 - 1.0).max(1.0);
        // Nearest-rank percentile index, clamped to the last element.
        let rank = |p: f64| ((n as f64 * p).ceil() as usize).saturating_sub(1).min(n - 1);
        Self {
            median_ms,
            std_dev_ms: variance.sqrt(),
            p99_ms: ordered[rank(0.99)],
            p999_ms: ordered[rank(0.999)],
        }
    }

    /// True when the ITL standard deviation is under `threshold_ms`.
    #[must_use]
    pub fn is_low_jitter(&self, threshold_ms: f64) -> bool {
        self.std_dev_ms < threshold_ms
    }
}
/// Result of comparing quantized logits against an fp32 reference.
#[derive(Debug, Clone, PartialEq)]
pub enum QualityResult {
    /// KL divergence was under the acceptance threshold.
    Pass {
        kl_divergence: f64,
    },
    /// KL divergence exceeded the threshold (or the inputs were unusable).
    Fail {
        kl_divergence: f64,
        threshold: f64,
        message: &'static str,
    },
}

/// Numerically-stable softmax over `logits`, widened to f64.
/// The max logit is subtracted before exponentiation to avoid overflow.
fn softmax(logits: &[f32]) -> Vec<f64> {
    let max_logit = logits.iter().copied().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f64> = logits
        .iter()
        .map(|&l| f64::from(l - max_logit).exp())
        .collect();
    let total: f64 = exps.iter().sum();
    exps.into_iter().map(|e| e / total).collect()
}

/// Validates quantization quality by measuring KL(fp32 ‖ quantized) over
/// the softmax distributions of the two logit vectors.
///
/// Mismatched lengths always fail; empty inputs trivially pass with a
/// divergence of 0.
#[must_use]
pub fn validate_quantization_quality(
    fp32_logits: &[f32],
    quantized_logits: &[f32],
    threshold: f64,
) -> QualityResult {
    if fp32_logits.len() != quantized_logits.len() {
        return QualityResult::Fail {
            kl_divergence: f64::INFINITY,
            threshold,
            message: "Logit vector lengths do not match",
        };
    }
    if fp32_logits.is_empty() {
        return QualityResult::Pass { kl_divergence: 0.0 };
    }
    let p_dist = softmax(fp32_logits);
    let q_dist = softmax(quantized_logits);
    // Terms where either probability underflows are dropped to avoid
    // log-of-zero blowups.
    let kl_div: f64 = p_dist
        .iter()
        .zip(&q_dist)
        .filter(|(p, q)| **p > 1e-10 && **q > 1e-10)
        .map(|(p, q)| p * (p / q).ln())
        .sum();
    if kl_div < threshold {
        QualityResult::Pass {
            kl_divergence: kl_div,
        }
    } else {
        QualityResult::Fail {
            kl_divergence: kl_div,
            threshold,
            message: "Quantization quality degradation detected",
        }
    }
}
/// Identity of the benchmarked model/runtime combination.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkConfig {
    pub model: String,
    /// Model file format label (e.g. as reported by the loader).
    pub format: String,
    pub quantization: String,
    pub runtime: String,
    pub runtime_version: String,
}

/// Raw per-run benchmark measurements, before summarization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResult {
    pub config: BenchmarkConfig,
    pub cold_start_ms: f64,
    pub model_load_ms: f64,
    /// Per-iteration time-to-first-token samples.
    pub ttft_ms: Vec<f64>,
    /// Per-iteration inter-token latency samples.
    pub itl_ms: Vec<f64>,
    /// Per-iteration generation throughput samples (tokens/second).
    pub generation_tok_s: Vec<f64>,
    pub peak_memory_mb: u64,
    pub kv_cache_waste_pct: f64,
    pub energy_joules: f64,
    pub tokens_generated: u64,
    /// Iterations actually run before the dynamic sampler stopped.
    pub actual_iterations: usize,
    /// Coefficient of variation at the moment sampling stopped.
    pub cv_at_stop: f64,
    pub timestamp: u64,
}

/// Summary statistics derived from a `BenchmarkResult` (see `summary`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
    pub ttft_p50: f64,
    pub ttft_p95: f64,
    pub ttft_p99: f64,
    pub ttft_p999: f64,
    pub itl_median: f64,
    pub itl_std_dev: f64,
    pub throughput_median: f64,
    /// 95% bootstrap confidence interval (lower, upper).
    pub throughput_ci_95: (f64, f64),
    pub token_joules: f64,
    pub memory_waste_pct: f64,
    pub iterations: usize,
    pub cv_final: f64,
}
impl BenchmarkResult {
    /// Collapses the raw per-iteration measurements into summary statistics.
    #[must_use]
    pub fn summary(&self) -> BenchmarkSummary {
        // Energy per token; zero when no tokens were generated.
        let token_joules = match self.tokens_generated {
            0 => 0.0,
            tokens => self.energy_joules / tokens as f64,
        };
        BenchmarkSummary {
            ttft_p50: percentile(&self.ttft_ms, 50.0),
            ttft_p95: percentile(&self.ttft_ms, 95.0),
            ttft_p99: percentile(&self.ttft_ms, 99.0),
            ttft_p999: percentile(&self.ttft_ms, 99.9),
            itl_median: percentile(&self.itl_ms, 50.0),
            itl_std_dev: compute_std_dev(&self.itl_ms),
            throughput_median: percentile(&self.generation_tok_s, 50.0),
            throughput_ci_95: bootstrap_ci(&self.generation_tok_s, 0.95, 1000),
            token_joules,
            memory_waste_pct: self.kv_cache_waste_pct,
            iterations: self.actual_iterations,
            cv_final: self.cv_at_stop,
        }
    }
}
/// Nearest-rank percentile (`p` in 0..=100) of `data`; 0.0 for empty input.
/// NaN-containing data sorts NaNs as equal, matching the other sorters here.
fn percentile(data: &[f64], p: f64) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    let mut ordered = data.to_vec();
    ordered.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let rank = (ordered.len() as f64 * p / 100.0).ceil() as usize;
    ordered[rank.saturating_sub(1).min(ordered.len() - 1)]
}
/// Sample standard deviation of `data`, delegating to `compute_variance`
/// (hence 0.0 below two points).
fn compute_std_dev(data: &[f64]) -> f64 {
    let variance = compute_variance(data);
    variance.sqrt()
}
/// Deterministic bootstrap confidence interval for the mean of `data`.
///
/// Resampling indices come from a fixed LCG-style sequence rather than a
/// real RNG, so repeated runs produce identical intervals. Returns
/// `(0.0, 0.0)` for empty input or zero resamples.
///
/// Fix: the original indexed `means[n_resamples - 1]` unguarded, so
/// `n_resamples == 0` with non-empty data underflowed `usize` and panicked.
fn bootstrap_ci(data: &[f64], confidence: f64, n_resamples: usize) -> (f64, f64) {
    // Guard both degenerate inputs; without the n_resamples check the
    // index clamp below would underflow.
    if data.is_empty() || n_resamples == 0 {
        return (0.0, 0.0);
    }
    let n = data.len();
    let mut bootstrap_means = Vec::with_capacity(n_resamples);
    for i in 0..n_resamples {
        // Multiplier/increment of a standard 64-bit LCG (Knuth MMIX).
        let seed = (i as u64)
            .wrapping_mul(6_364_136_223_846_793_005)
            .wrapping_add(1);
        let mut sum = 0.0;
        for j in 0..n {
            let idx = ((seed.wrapping_mul(j as u64 + 1)) as usize) % n;
            sum += data[idx];
        }
        bootstrap_means.push(sum / n as f64);
    }
    bootstrap_means.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let alpha = 1.0 - confidence;
    let lower_idx = ((n_resamples as f64 * alpha / 2.0).floor() as usize).min(n_resamples - 1);
    let upper_idx =
        ((n_resamples as f64 * (1.0 - alpha / 2.0)).ceil() as usize).min(n_resamples - 1);
    (bootstrap_means[lower_idx], bootstrap_means[upper_idx])
}
/// Canonical benchmark workload shapes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WorkloadType {
    /// Short question-answering: small prompt, small completion.
    ShortQa,
    /// Long-context workload: large prompt, medium completion.
    LongContext,
}

impl WorkloadType {
    /// Prompt length in tokens for this workload.
    #[must_use]
    pub const fn input_tokens(&self) -> usize {
        match *self {
            WorkloadType::ShortQa => 32,
            WorkloadType::LongContext => 2048,
        }
    }

    /// Completion length in tokens for this workload.
    #[must_use]
    pub const fn output_tokens(&self) -> usize {
        match *self {
            WorkloadType::ShortQa => 64,
            WorkloadType::LongContext => 512,
        }
    }
}
/// Thresholds for the convoy (head-of-line blocking) scheduler test.
#[derive(Debug, Clone)]
pub struct ConvoyTestConfig {
    /// Number of long-context requests injected as the "convoy".
    pub long_requests: usize,
    /// Number of short requests measured behind the convoy.
    pub short_requests: usize,
    pub max_p99_increase_pct: f64,
    pub max_hol_blocking_ms: f64,
    pub max_kv_fragmentation_pct: f64,
}

impl Default for ConvoyTestConfig {
    fn default() -> Self {
        ConvoyTestConfig {
            long_requests: 10,
            short_requests: 100,
            max_p99_increase_pct: 50.0,
            max_hol_blocking_ms: 500.0,
            max_kv_fragmentation_pct: 15.0,
        }
    }
}
/// Latency breakdown for a single request in a convoy test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvoyRequestResult {
    pub workload_type: String,
    pub queue_time_ms: f64,
    pub ttft_ms: f64,
    pub total_latency_ms: f64,
}

/// Aggregated outcome of a convoy test run (see `ConvoyTestResult::new`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvoyTestResult {
    pub long_requests: usize,
    pub short_requests: usize,
    /// Short-request P99 with no convoy present.
    pub baseline_short_p99_ms: f64,
    /// Short-request P99 while the convoy is in flight.
    pub convoy_short_p99_ms: f64,
    pub p99_increase_pct: f64,
    pub max_hol_blocking_ms: f64,
    pub avg_hol_blocking_ms: f64,
    pub kv_fragmentation_pct: f64,
    pub passed: bool,
    /// One entry per threshold violation; empty when `passed` is true.
    pub failure_reasons: Vec<String>,
}
impl ConvoyTestResult {
    /// Evaluates a convoy (head-of-line blocking) test run against `config`.
    ///
    /// Compares the short-request P99 latency with and without long
    /// requests in flight, summarizes HOL blocking, and records every
    /// threshold violation in `failure_reasons` (empty ⇒ `passed`).
    #[must_use]
    pub fn new(
        config: &ConvoyTestConfig,
        baseline_short_latencies: &[f64],
        convoy_short_latencies: &[f64],
        hol_blocking_times: &[f64],
        kv_fragmentation_pct: f64,
    ) -> Self {
        let baseline_short_p99 = percentile(baseline_short_latencies, 99.0);
        let convoy_short_p99 = percentile(convoy_short_latencies, 99.0);
        // Guard against division by zero when the baseline P99 is zero.
        let p99_increase_pct = if baseline_short_p99 > 0.0 {
            ((convoy_short_p99 - baseline_short_p99) / baseline_short_p99) * 100.0
        } else {
            0.0
        };
        let max_hol_blocking = hol_blocking_times.iter().copied().fold(0.0_f64, f64::max);
        let avg_hol_blocking = if hol_blocking_times.is_empty() {
            0.0
        } else {
            hol_blocking_times.iter().sum::<f64>() / hol_blocking_times.len() as f64
        };
        // Record each threshold violation; the test passes only if none fire.
        let mut failure_reasons = Vec::new();
        if p99_increase_pct > config.max_p99_increase_pct {
            failure_reasons.push(format!(
                "P99 increase {p99_increase_pct:.1}% exceeds threshold {:.1}%",
                config.max_p99_increase_pct
            ));
        }
        if max_hol_blocking > config.max_hol_blocking_ms {
            failure_reasons.push(format!(
                "Max HOL blocking {max_hol_blocking:.1}ms exceeds threshold {:.1}ms",
                config.max_hol_blocking_ms
            ));
        }
        if kv_fragmentation_pct > config.max_kv_fragmentation_pct {
            failure_reasons.push(format!(
                "KV fragmentation {kv_fragmentation_pct:.1}% exceeds threshold {:.1}%",
                config.max_kv_fragmentation_pct
            ));
        }
        Self {
            long_requests: config.long_requests,
            short_requests: config.short_requests,
            baseline_short_p99_ms: baseline_short_p99,
            convoy_short_p99_ms: convoy_short_p99,
            p99_increase_pct,
            max_hol_blocking_ms: max_hol_blocking,
            avg_hol_blocking_ms: avg_hol_blocking,
            kv_fragmentation_pct,
            passed: failure_reasons.is_empty(),
            failure_reasons,
        }
    }
}
/// Thresholds for the CPU-saturation stress test.
#[derive(Debug, Clone)]
pub struct SaturationTestConfig {
    /// Background CPU load applied during the stressed run, percent.
    pub cpu_load_pct: u8,
    pub max_throughput_degradation_pct: f64,
    pub max_p99_increase_pct: f64,
}

impl Default for SaturationTestConfig {
    fn default() -> Self {
        SaturationTestConfig {
            cpu_load_pct: 50,
            max_throughput_degradation_pct: 30.0,
            max_p99_increase_pct: 100.0,
        }
    }
}
/// Aggregated outcome of a saturation test (see `SaturationTestResult::new`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SaturationTestResult {
    pub cpu_load_pct: u8,
    pub baseline_throughput: f64,
    pub stressed_throughput: f64,
    pub throughput_degradation_pct: f64,
    pub baseline_p99_ms: f64,
    pub stressed_p99_ms: f64,
    pub p99_increase_pct: f64,
    pub passed: bool,
    /// One entry per threshold violation; empty when `passed` is true.
    pub failure_reasons: Vec<String>,
}
impl SaturationTestResult {
    /// Scores a CPU-saturation test: compares mean throughput and P99
    /// latency between an unloaded baseline and a run under background CPU
    /// stress, recording every threshold violation from `config`.
    #[must_use]
    pub fn new(
        config: &SaturationTestConfig,
        baseline_throughputs: &[f64],
        stressed_throughputs: &[f64],
        baseline_latencies: &[f64],
        stressed_latencies: &[f64],
    ) -> Self {
        // Mean throughput per condition (0.0 when there are no samples).
        let baseline_throughput = if baseline_throughputs.is_empty() {
            0.0
        } else {
            baseline_throughputs.iter().sum::<f64>() / baseline_throughputs.len() as f64
        };
        let stressed_throughput = if stressed_throughputs.is_empty() {
            0.0
        } else {
            stressed_throughputs.iter().sum::<f64>() / stressed_throughputs.len() as f64
        };
        // Positive values mean throughput dropped under load.
        let throughput_degradation_pct = if baseline_throughput > 0.0 {
            ((baseline_throughput - stressed_throughput) / baseline_throughput) * 100.0
        } else {
            0.0
        };
        let baseline_p99 = percentile(baseline_latencies, 99.0);
        let stressed_p99 = percentile(stressed_latencies, 99.0);
        let p99_increase_pct = if baseline_p99 > 0.0 {
            ((stressed_p99 - baseline_p99) / baseline_p99) * 100.0
        } else {
            0.0
        };
        // Record each threshold violation; the test passes only if none fire.
        let mut failure_reasons = Vec::new();
        if throughput_degradation_pct > config.max_throughput_degradation_pct {
            failure_reasons.push(format!(
                "Throughput degradation {throughput_degradation_pct:.1}% exceeds threshold {:.1}%",
                config.max_throughput_degradation_pct
            ));
        }
        if p99_increase_pct > config.max_p99_increase_pct {
            failure_reasons.push(format!(
                "P99 increase {p99_increase_pct:.1}% exceeds threshold {:.1}%",
                config.max_p99_increase_pct
            ));
        }
        Self {
            cpu_load_pct: config.cpu_load_pct,
            baseline_throughput,
            stressed_throughput,
            throughput_degradation_pct,
            baseline_p99_ms: baseline_p99,
            stressed_p99_ms: stressed_p99,
            p99_increase_pct,
            passed: failure_reasons.is_empty(),
            failure_reasons,
        }
    }
}
/// Hardware description recorded alongside benchmark results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareSpec {
    pub cpu: String,
    pub gpu: Option<String>,
    pub memory_gb: u64,
    pub storage: String,
}

impl Default for HardwareSpec {
    // Placeholder spec used when hardware detection is unavailable.
    fn default() -> Self {
        Self {
            cpu: "Unknown".to_string(),
            gpu: None,
            memory_gb: 0,
            storage: "Unknown".to_string(),
        }
    }
}
/// Description of the sampling strategy used for a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SamplingConfig {
    /// Strategy name (currently "dynamic_cv").
    pub method: String,
    pub cv_threshold: f64,
    pub actual_iterations: usize,
    pub cv_at_stop: f64,
    pub warmup_iterations: usize,
}

impl Default for SamplingConfig {
    fn default() -> Self {
        Self {
            method: "dynamic_cv".to_string(),
            cv_threshold: 0.05,
            actual_iterations: 0,
            cv_at_stop: 0.0,
            warmup_iterations: 100,
        }
    }
}
/// Thermal validity information recorded with a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThermalInfo {
    /// False when temperature variance invalidated the run.
    pub valid: bool,
    pub temp_variance_c: f64,
    pub max_temp_c: f64,
}

impl Default for ThermalInfo {
    fn default() -> Self {
        Self {
            valid: true,
            temp_variance_c: 0.0,
            max_temp_c: 0.0,
        }
    }
}
/// TTFT percentiles, milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TtftResults {
    pub p50: f64,
    pub p95: f64,
    pub p99: f64,
    pub p999: f64,
}

/// Inter-token latency summary, milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ItlResults {
    pub median: f64,
    pub std_dev: f64,
    pub p99: f64,
}

/// Generation throughput summary, tokens/second.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputResults {
    pub median: f64,
    /// 95% bootstrap confidence interval (lower, upper).
    pub ci_95: (f64, f64),
}

/// Memory footprint summary, mebibytes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryResults {
    pub model_mb: u64,
    pub peak_rss_mb: u64,
    pub kv_waste_pct: f64,
}

/// Energy summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnergyResults {
    pub total_joules: f64,
    pub token_joules: f64,
    pub idle_watts: f64,
}

/// Cold-start latency summary, milliseconds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ColdStartResults {
    pub median: f64,
    pub p99: f64,
}

/// Output-quality checks against the fp32 reference.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityValidation {
    pub kl_divergence_vs_fp32: f64,
    /// Perplexity on WikiText-2, when measured.
    pub perplexity_wikitext2: Option<f64>,
}
/// Complete, serializable benchmark report (schema version in `version`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FullBenchmarkResult {
    pub version: String,
    pub timestamp: String,
    pub config: BenchmarkConfig,
    pub hardware: HardwareSpec,
    pub sampling: SamplingConfig,
    pub thermal: ThermalInfo,
    pub results: BenchmarkResults,
    pub quality: QualityValidation,
}

/// The measured-results section of a `FullBenchmarkResult`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResults {
    pub ttft_ms: TtftResults,
    pub itl_ms: ItlResults,
    pub throughput_tok_s: ThroughputResults,
    pub memory_mb: MemoryResults,
    pub energy: EnergyResults,
    pub cold_start_ms: ColdStartResults,
}
impl FullBenchmarkResult {
    /// Assembles the full serializable report from raw benchmark output,
    /// hardware info, the run's temperature trace, and the quantization
    /// KL divergence.
    #[must_use]
    pub fn from_benchmark_result(
        result: &BenchmarkResult,
        hardware: HardwareSpec,
        thermal_temps: &[f64],
        kl_divergence: f64,
    ) -> Self {
        // Thermal validity is judged with default guard thresholds.
        let thermal_guard = ThermalGuard::default();
        let thermal_validity = thermal_guard.validate_run(thermal_temps);
        let summary = result.summary();
        Self {
            version: "1.1".to_string(),
            timestamp: chrono_timestamp(),
            config: result.config.clone(),
            hardware,
            sampling: SamplingConfig {
                method: "dynamic_cv".to_string(),
                // NOTE(review): threshold and warmup are restated as literals
                // rather than taken from the sampler actually used — keep in
                // sync with DynamicSampler's defaults.
                cv_threshold: 0.05,
                actual_iterations: result.actual_iterations,
                cv_at_stop: result.cv_at_stop,
                warmup_iterations: 100,
            },
            thermal: ThermalInfo {
                valid: thermal_validity == ThermalValidity::Valid,
                temp_variance_c: thermal_guard.temp_variance(thermal_temps),
                max_temp_c: thermal_guard.max_temp(thermal_temps),
            },
            results: BenchmarkResults {
                ttft_ms: TtftResults {
                    p50: summary.ttft_p50,
                    p95: summary.ttft_p95,
                    p99: summary.ttft_p99,
                    p999: summary.ttft_p999,
                },
                itl_ms: ItlResults {
                    median: summary.itl_median,
                    std_dev: summary.itl_std_dev,
                    p99: percentile(&result.itl_ms, 99.0),
                },
                throughput_tok_s: ThroughputResults {
                    median: summary.throughput_median,
                    ci_95: summary.throughput_ci_95,
                },
                memory_mb: MemoryResults {
                    // NOTE(review): model size estimated as half of peak RSS —
                    // a placeholder heuristic, not a measurement; confirm.
                    model_mb: result.peak_memory_mb / 2,
                    peak_rss_mb: result.peak_memory_mb,
                    kv_waste_pct: result.kv_cache_waste_pct,
                },
                energy: EnergyResults {
                    total_joules: result.energy_joules,
                    token_joules: summary.token_joules,
                    // NOTE(review): idle power is not measured here; hard-coded.
                    idle_watts: 0.0,
                },
                cold_start_ms: ColdStartResults {
                    median: result.cold_start_ms,
                    // NOTE(review): P99 approximated as 1.5x the single
                    // measured cold start — placeholder until multiple runs
                    // are recorded.
                    p99: result.cold_start_ms * 1.5,
                },
            },
            quality: QualityValidation {
                kl_divergence_vs_fp32: kl_divergence,
                perplexity_wikitext2: None,
            },
        }
    }

    /// Serializes the report as pretty-printed JSON.
    ///
    /// # Errors
    /// Returns any `serde_json` serialization error.
    pub fn to_json(&self) -> Result<String, serde_json::Error> {
        serde_json::to_string_pretty(self)
    }

    /// Parses a report previously produced by `to_json`.
    ///
    /// # Errors
    /// Returns any `serde_json` deserialization error.
    pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
        serde_json::from_str(json)
    }
}
/// Current wall-clock time as an ISO-8601 UTC timestamp
/// (`YYYY-MM-DDTHH:MM:SSZ`), using only the standard library.
///
/// Fix: the previous version emitted a non-standard placeholder string
/// (`1970-01-01T00:00:00Z+{secs}s`) instead of a real timestamp. Falls
/// back to the epoch if the system clock reports a pre-epoch time.
fn chrono_timestamp() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    format_utc_timestamp(secs)
}

/// Formats seconds-since-Unix-epoch as `YYYY-MM-DDTHH:MM:SSZ` (UTC,
/// proleptic Gregorian) via the standard civil-from-days algorithm.
fn format_utc_timestamp(secs: u64) -> String {
    let days = secs / 86_400;
    let secs_of_day = secs % 86_400;
    // Civil-from-days (H. Hinnant): shift the epoch to 0000-03-01 so leap
    // days fall at the end of each 400-year era.
    let z = days + 719_468;
    let era = z / 146_097;
    let doe = z % 146_097; // day of era, [0, 146096]
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365;
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day of year, March-based
    let mp = (5 * doy + 2) / 153; // month index with March = 0
    let day = doy - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the following civil year.
    let year = yoe + era * 400 + u64::from(month <= 2);
    format!(
        "{year:04}-{month:02}-{day:02}T{:02}:{:02}:{:02}Z",
        secs_of_day / 3_600,
        (secs_of_day % 3_600) / 60,
        secs_of_day % 60
    )
}
/// Head-to-head deltas between two benchmark runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkComparison {
    pub baseline_runtime: String,
    pub current_runtime: String,
    /// Percent change vs. baseline; negative means `current` improved.
    pub ttft_p99_change_pct: f64,
    /// Percent change vs. baseline; positive means `current` improved.
    pub throughput_change_pct: f64,
    pub memory_change_pct: f64,
    pub energy_change_pct: f64,
    /// Runtime name of the overall winner, or "tie".
    pub winner: String,
    pub significance: f64,
}
impl BenchmarkComparison {
    /// Compares `current` against `baseline` across TTFT P99, throughput,
    /// memory, and energy. Each metric that moves by more than ±5% scores
    /// a point for one side; the side with more points is the `winner`
    /// ("tie" on equal points).
    #[must_use]
    pub fn compare(baseline: &FullBenchmarkResult, current: &FullBenchmarkResult) -> Self {
        // Relative percent change per metric; 0.0 when the baseline value
        // is zero/unmeasured (avoids division by zero).
        let ttft_p99_change = if baseline.results.ttft_ms.p99 > 0.0 {
            ((current.results.ttft_ms.p99 - baseline.results.ttft_ms.p99)
                / baseline.results.ttft_ms.p99)
                * 100.0
        } else {
            0.0
        };
        let throughput_change = if baseline.results.throughput_tok_s.median > 0.0 {
            ((current.results.throughput_tok_s.median - baseline.results.throughput_tok_s.median)
                / baseline.results.throughput_tok_s.median)
                * 100.0
        } else {
            0.0
        };
        let memory_change = if baseline.results.memory_mb.peak_rss_mb > 0 {
            ((current.results.memory_mb.peak_rss_mb as f64
                - baseline.results.memory_mb.peak_rss_mb as f64)
                / baseline.results.memory_mb.peak_rss_mb as f64)
                * 100.0
        } else {
            0.0
        };
        let energy_change = if baseline.results.energy.token_joules > 0.0 {
            ((current.results.energy.token_joules - baseline.results.energy.token_joules)
                / baseline.results.energy.token_joules)
                * 100.0
        } else {
            0.0
        };
        // Score each metric outside a ±5% noise band. Lower is better for
        // TTFT, memory, and energy; higher is better for throughput.
        let mut current_wins = 0;
        let mut baseline_wins = 0;
        if ttft_p99_change < -5.0 {
            current_wins += 1;
        } else if ttft_p99_change > 5.0 {
            baseline_wins += 1;
        }
        if throughput_change > 5.0 {
            current_wins += 1;
        } else if throughput_change < -5.0 {
            baseline_wins += 1;
        }
        if memory_change < -5.0 {
            current_wins += 1;
        } else if memory_change > 5.0 {
            baseline_wins += 1;
        }
        if energy_change < -5.0 {
            current_wins += 1;
        } else if energy_change > 5.0 {
            baseline_wins += 1;
        }
        let winner = match current_wins.cmp(&baseline_wins) {
            std::cmp::Ordering::Greater => current.config.runtime.clone(),
            std::cmp::Ordering::Less => baseline.config.runtime.clone(),
            std::cmp::Ordering::Equal => "tie".to_string(),
        };
        Self {
            baseline_runtime: baseline.config.runtime.clone(),
            current_runtime: current.config.runtime.clone(),
            ttft_p99_change_pct: ttft_p99_change,
            throughput_change_pct: throughput_change,
            memory_change_pct: memory_change,
            energy_change_pct: energy_change,
            winner,
            // NOTE(review): hard-coded placeholder, not a computed p-value —
            // confirm before reporting statistical significance.
            significance: 0.001,
        }
    }
}
/// Outcome of a regression gate between two benchmark runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionResult {
    pub regression_detected: bool,
    /// Human-readable description of each regressed metric.
    pub regressed_metrics: Vec<String>,
    pub threshold_pct: f64,
}

impl RegressionResult {
    /// Flags any metric that regressed by more than `threshold_pct` percent:
    /// higher TTFT P99, lower throughput, or higher peak memory. Metrics
    /// whose baseline value is zero/unmeasured are skipped.
    #[must_use]
    pub fn check(
        baseline: &FullBenchmarkResult,
        current: &FullBenchmarkResult,
        threshold_pct: f64,
    ) -> Self {
        let mut regressed_metrics = Vec::new();
        if baseline.results.ttft_ms.p99 > 0.0 {
            let change = ((current.results.ttft_ms.p99 - baseline.results.ttft_ms.p99)
                / baseline.results.ttft_ms.p99)
                * 100.0;
            if change > threshold_pct {
                regressed_metrics.push(format!("ttft_p99 (+{change:.1}%)"));
            }
        }
        if baseline.results.throughput_tok_s.median > 0.0 {
            // Sign flipped so a throughput drop is a positive change.
            let change = ((baseline.results.throughput_tok_s.median
                - current.results.throughput_tok_s.median)
                / baseline.results.throughput_tok_s.median)
                * 100.0;
            if change > threshold_pct {
                regressed_metrics.push(format!("throughput (-{change:.1}%)"));
            }
        }
        if baseline.results.memory_mb.peak_rss_mb > 0 {
            let change = ((current.results.memory_mb.peak_rss_mb as f64
                - baseline.results.memory_mb.peak_rss_mb as f64)
                / baseline.results.memory_mb.peak_rss_mb as f64)
                * 100.0;
            if change > threshold_pct {
                regressed_metrics.push(format!("memory (+{change:.1}%)"));
            }
        }
        Self {
            regression_detected: !regressed_metrics.is_empty(),
            regressed_metrics,
            threshold_pct,
        }
    }
}
use std::collections::HashMap;
use crate::error::RealizarError;
/// Supported inference runtimes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RuntimeType {
    Realizar,
    LlamaCpp,
    Vllm,
    Ollama,
}
impl RuntimeType {
#[must_use]
pub fn as_str(&self) -> &'static str {
match self {
Self::Realizar => "realizar",
Self::LlamaCpp => "llama-cpp",
Self::Vllm => "vllm",
Self::Ollama => "ollama",
}
}
#[must_use]
pub fn parse(s: &str) -> Option<Self> {
match s.to_lowercase().as_str() {
"realizar" => Some(Self::Realizar),
"llama-cpp" | "llama.cpp" | "llamacpp" => Some(Self::LlamaCpp),
"vllm" => Some(Self::Vllm),
"ollama" => Some(Self::Ollama),
_ => None,
}
}
}
/// A single text-generation request, runtime-agnostic.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferenceRequest {
    pub prompt: String,
    pub max_tokens: usize,
    pub temperature: f64,
    /// Stop sequences that end generation early.
    pub stop: Vec<String>,
}

impl Default for InferenceRequest {
    fn default() -> Self {
        Self {
            prompt: String::new(),
            max_tokens: 100,
            temperature: 0.7,
            stop: Vec::new(),
        }
    }
}
impl InferenceRequest {
    /// Creates a request for `prompt` with default sampling settings.
    #[must_use]
    pub fn new(prompt: &str) -> Self {
        Self {
            prompt: prompt.to_owned(),
            ..Self::default()
        }
    }

    /// Builder-style override for the completion length cap.
    #[must_use]
    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
        self.max_tokens = max_tokens;
        self
    }

    /// Builder-style override for the sampling temperature.
    #[must_use]
    pub fn with_temperature(mut self, temperature: f64) -> Self {
        self.temperature = temperature;
        self
    }
}
/// Timing-annotated result of a single inference call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferenceResponse {
    pub text: String,
    pub tokens_generated: usize,
    /// Time to first token, milliseconds.
    pub ttft_ms: f64,
    pub total_time_ms: f64,
    /// Per-gap inter-token latencies, milliseconds.
    pub itl_ms: Vec<f64>,
}
impl InferenceResponse {
    /// Overall decode rate; 0.0 when no positive wall time was recorded.
    #[must_use]
    pub fn tokens_per_second(&self) -> f64 {
        if self.total_time_ms <= 0.0 {
            return 0.0;
        }
        let seconds = self.total_time_ms / 1000.0;
        self.tokens_generated as f64 / seconds
    }
}
/// Static capabilities reported by a backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackendInfo {
    pub runtime_type: RuntimeType,
    pub version: String,
    pub supports_streaming: bool,
    pub loaded_model: Option<String>,
}

/// Common interface every benchmarked runtime implements.
pub trait RuntimeBackend: Send + Sync {
    /// Describes this backend (runtime kind, version, capabilities).
    fn info(&self) -> BackendInfo;
    /// Runs one generation request synchronously.
    fn inference(&self, request: &InferenceRequest) -> Result<InferenceResponse, RealizarError>;
    /// Loads a model; the default implementation is a no-op for backends
    /// that are configured up front or load lazily.
    fn load_model(&mut self, _model_path: &str) -> Result<(), RealizarError> {
        Ok(())
    }
}
/// Test double that fabricates deterministic timing without running a model.
pub struct MockBackend {
    // Simulated time-to-first-token, milliseconds.
    ttft_ms: f64,
    // Simulated steady-state decode rate, tokens/second.
    tokens_per_second: f64,
}

impl MockBackend {
    /// Creates a mock with the given simulated latency profile.
    #[must_use]
    pub fn new(ttft_ms: f64, tokens_per_second: f64) -> Self {
        MockBackend {
            ttft_ms,
            tokens_per_second,
        }
    }
}
impl RuntimeBackend for MockBackend {
    fn info(&self) -> BackendInfo {
        BackendInfo {
            runtime_type: RuntimeType::Realizar,
            version: env!("CARGO_PKG_VERSION").to_string(),
            supports_streaming: true,
            loaded_model: None,
        }
    }

    /// Fabricates a response whose timing follows the configured profile.
    fn inference(&self, request: &InferenceRequest) -> Result<InferenceResponse, RealizarError> {
        // Mock generation is capped at 100 tokens regardless of the request.
        let tokens = request.max_tokens.min(100);
        let gen_time_ms = (tokens as f64) / self.tokens_per_second * 1000.0;
        let per_token_ms = gen_time_ms / tokens as f64;
        Ok(InferenceResponse {
            text: "Mock response".to_string(),
            tokens_generated: tokens,
            ttft_ms: self.ttft_ms,
            total_time_ms: self.ttft_ms + gen_time_ms,
            itl_ms: vec![per_token_ms; tokens],
        })
    }
}
/// Lookup table from runtime kind to its backend implementation.
pub struct BackendRegistry {
    backends: HashMap<RuntimeType, Box<dyn RuntimeBackend>>,
}

impl BackendRegistry {
    /// Creates an empty registry.
    #[must_use]
    pub fn new() -> Self {
        BackendRegistry {
            backends: HashMap::new(),
        }
    }

    /// Registers (or replaces) the backend for `runtime`.
    pub fn register(&mut self, runtime: RuntimeType, backend: Box<dyn RuntimeBackend>) {
        self.backends.insert(runtime, backend);
    }

    /// Fetches the backend registered for `runtime`, if any.
    #[must_use]
    pub fn get(&self, runtime: RuntimeType) -> Option<&dyn RuntimeBackend> {
        self.backends.get(&runtime).map(|b| b.as_ref())
    }

    /// Lists the runtimes that currently have a backend (arbitrary order).
    #[must_use]
    pub fn list(&self) -> Vec<RuntimeType> {
        self.backends.keys().copied().collect()
    }
}

impl Default for BackendRegistry {
    fn default() -> Self {
        Self::new()
    }
}
/// Configuration for shelling out to a local `llama-cli` binary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlamaCppConfig {
    pub binary_path: String,
    pub model_path: Option<String>,
    /// Layers offloaded to the GPU (`-ngl`); 0 = CPU only.
    pub n_gpu_layers: u32,
    /// Context window size (`-c`).
    pub ctx_size: usize,
    /// CPU thread count (`-t`).
    pub threads: usize,
}

impl Default for LlamaCppConfig {
    fn default() -> Self {
        Self {
            binary_path: "llama-cli".to_string(),
            model_path: None,
            n_gpu_layers: 0,
            ctx_size: 2048,
            threads: 4,
        }
    }
}
impl LlamaCppConfig {
    /// Config pointing at `binary_path`, with defaults for everything else.
    #[must_use]
    pub fn new(binary_path: &str) -> Self {
        Self {
            binary_path: binary_path.to_owned(),
            ..Self::default()
        }
    }

    /// Builder-style setter for the model file path.
    #[must_use]
    pub fn with_model(mut self, model_path: &str) -> Self {
        self.model_path = Some(model_path.to_owned());
        self
    }

    /// Builder-style setter for the GPU layer offload count.
    #[must_use]
    pub fn with_gpu_layers(mut self, layers: u32) -> Self {
        self.n_gpu_layers = layers;
        self
    }

    /// Builder-style setter for the context window size.
    #[must_use]
    pub fn with_ctx_size(mut self, ctx_size: usize) -> Self {
        self.ctx_size = ctx_size;
        self
    }
}
/// Connection settings for a vLLM OpenAI-compatible server.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VllmConfig {
    pub base_url: String,
    pub api_version: String,
    pub model: Option<String>,
    pub api_key: Option<String>,
}

impl Default for VllmConfig {
    fn default() -> Self {
        Self {
            base_url: "http://localhost:8000".to_string(),
            api_version: "v1".to_string(),
            model: None,
            api_key: None,
        }
    }
}
impl VllmConfig {
    /// Config pointing at `base_url`, with defaults for everything else.
    #[must_use]
    pub fn new(base_url: &str) -> Self {
        Self {
            base_url: base_url.to_owned(),
            ..Self::default()
        }
    }

    /// Builder-style setter for the target model name.
    #[must_use]
    pub fn with_model(mut self, model: &str) -> Self {
        self.model = Some(model.to_owned());
        self
    }

    /// Builder-style setter for the API key.
    #[must_use]
    pub fn with_api_key(mut self, api_key: &str) -> Self {
        self.api_key = Some(api_key.to_owned());
        self
    }
}
/// Backend that runs inference by invoking a local `llama-cli` process.
pub struct LlamaCppBackend {
    config: LlamaCppConfig,
}

impl LlamaCppBackend {
    /// Creates a backend around the given CLI configuration.
    #[must_use]
    pub fn new(config: LlamaCppConfig) -> Self {
        Self { config }
    }

    /// Translates an `InferenceRequest` plus this config into `llama-cli`
    /// command-line arguments.
    #[must_use]
    pub fn build_cli_args(&self, request: &InferenceRequest) -> Vec<String> {
        let mut args = Vec::new();
        if let Some(ref model_path) = self.config.model_path {
            args.push("-m".to_string());
            args.push(model_path.clone());
        }
        args.push("-p".to_string());
        args.push(request.prompt.clone());
        args.push("-n".to_string());
        args.push(request.max_tokens.to_string());
        args.push("-ngl".to_string());
        args.push(self.config.n_gpu_layers.to_string());
        args.push("-c".to_string());
        args.push(self.config.ctx_size.to_string());
        args.push("-t".to_string());
        args.push(self.config.threads.to_string());
        // Only pass --temp when it differs from 0.8 — presumably the
        // llama-cli default; TODO confirm against the targeted binary.
        if (request.temperature - 0.8).abs() > 0.01 {
            args.push("--temp".to_string());
            args.push(format!("{:.2}", request.temperature));
        }
        args
    }

    /// Extracts `(milliseconds, count)` from a llama-cli timing line of the
    /// form `<metric> = <value> ms / <count> ...`.
    ///
    /// For "eval time" specifically, lines containing "prompt eval time" are
    /// skipped so the two metrics are not conflated.
    #[must_use]
    pub fn parse_timing_line(output: &str, metric_name: &str) -> Option<(f64, usize)> {
        for line in output.lines() {
            let matches = if metric_name == "eval time" {
                line.contains(metric_name) && !line.contains("prompt eval time")
            } else {
                line.contains(metric_name)
            };
            if matches && line.contains('=') {
                if let Some(eq_pos) = line.find('=') {
                    // The ms value sits between '=' and the "ms" marker.
                    let after_eq = &line[eq_pos + 1..];
                    if let Some(ms_pos) = after_eq.find("ms") {
                        let value_str = after_eq[..ms_pos].trim();
                        if let Ok(value) = value_str.parse::<f64>() {
                            // The count is the first token after the '/'.
                            if let Some(slash_pos) = after_eq.find('/') {
                                let after_slash = &after_eq[slash_pos + 1..];
                                let count_str =
                                    after_slash.split_whitespace().next().unwrap_or("0");
                                if let Ok(count) = count_str.parse::<usize>() {
                                    return Some((value, count));
                                }
                            }
                        }
                    }
                }
            }
        }
        None
    }

    /// Returns the generated-text portion of the output: every line before
    /// the first perf/sampler diagnostic line, trimmed.
    #[must_use]
    pub fn extract_generated_text(output: &str) -> String {
        let mut text_lines = Vec::new();
        for line in output.lines() {
            if line.contains("llama_perf_") || line.contains("sampler") {
                break;
            }
            text_lines.push(line);
        }
        text_lines.join("\n").trim().to_string()
    }

    /// Builds an `InferenceResponse` from combined llama-cli output.
    ///
    /// Missing timing lines degrade to zeros rather than failing; ITL is
    /// approximated as the mean eval time per generated token.
    ///
    /// # Errors
    /// Currently always returns `Ok`; the `Result` is kept for interface
    /// compatibility with fallible parsing.
    pub fn parse_cli_output(output: &str) -> Result<InferenceResponse, RealizarError> {
        let text = Self::extract_generated_text(output);
        let ttft_ms = Self::parse_timing_line(output, "prompt eval time").map_or(0.0, |(ms, _)| ms);
        let (total_time_ms, _) = Self::parse_timing_line(output, "total time").unwrap_or((0.0, 0));
        let (_, tokens_generated) =
            Self::parse_timing_line(output, "eval time").unwrap_or((0.0, 0));
        let eval_time = Self::parse_timing_line(output, "eval time").map_or(0.0, |(ms, _)| ms);
        // One fewer inter-token gap than tokens; empty when <= 1 token.
        let itl_ms = if tokens_generated > 1 {
            let avg_itl = eval_time / (tokens_generated as f64);
            vec![avg_itl; tokens_generated.saturating_sub(1)]
        } else {
            vec![]
        };
        Ok(InferenceResponse {
            text,
            tokens_generated,
            ttft_ms,
            total_time_ms,
            itl_ms,
        })
    }
}
impl RuntimeBackend for LlamaCppBackend {
    fn info(&self) -> BackendInfo {
        BackendInfo {
            runtime_type: RuntimeType::LlamaCpp,
            // NOTE(review): version is hard-coded, not queried from the
            // binary — confirm, or fetch via `llama-cli --version`.
            version: "b2345".to_string(),
            supports_streaming: false,
            loaded_model: self.config.model_path.clone(),
        }
    }

    /// Runs one inference by spawning the configured llama-cli binary and
    /// parsing its combined output.
    ///
    /// # Errors
    /// - `InvalidConfiguration` when no model path is configured.
    /// - `ModelNotFound` when the binary cannot be executed.
    /// - `InferenceError` when the process exits with a failure status.
    fn inference(&self, request: &InferenceRequest) -> Result<InferenceResponse, RealizarError> {
        use std::process::Command;
        let model_path = self.config.model_path.as_ref().ok_or_else(|| {
            RealizarError::InvalidConfiguration("model_path is required".to_string())
        })?;
        let args = self.build_cli_args(request);
        let output = Command::new(&self.config.binary_path)
            .args(&args)
            .output()
            .map_err(|e| {
                RealizarError::ModelNotFound(format!(
                    "Failed to execute {}: {}",
                    self.config.binary_path, e
                ))
            })?;
        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(RealizarError::InferenceError(format!(
                "llama-cli failed: {} (model: {})",
                stderr, model_path
            )));
        }
        // Text and timing diagnostics can land on either stream, so the
        // parser sees stdout and stderr concatenated.
        let stdout = String::from_utf8_lossy(&output.stdout);
        let stderr = String::from_utf8_lossy(&output.stderr);
        let combined_output = format!("{}\n{}", stdout, stderr);
        Self::parse_cli_output(&combined_output)
    }
}
/// Benchmark backend that talks to a vLLM server over its OpenAI-compatible
/// HTTP API.
#[cfg(feature = "bench-http")]
pub struct VllmBackend {
config: VllmConfig,
http_client: ModelHttpClient,
}
#[cfg(feature = "bench-http")]
impl VllmBackend {
/// Creates a backend with a fresh HTTP client.
#[must_use]
pub fn new(config: VllmConfig) -> Self {
Self {
config,
http_client: ModelHttpClient::new(),
}
}
/// Creates a backend reusing an existing HTTP client (e.g. for tests or a
/// shared connection pool).
#[must_use]
pub fn with_client(config: VllmConfig, client: ModelHttpClient) -> Self {
Self {
config,
http_client: client,
}
}
}
#[cfg(feature = "bench-http")]
impl RuntimeBackend for VllmBackend {
// Static metadata. NOTE(review): version is hard-coded, not queried from
// the server — confirm it tracks the deployed vLLM release.
fn info(&self) -> BackendInfo {
BackendInfo {
runtime_type: RuntimeType::Vllm,
version: "0.4.0".to_string(), supports_streaming: true,
loaded_model: self.config.model.clone(),
}
}
// Sends one non-streaming OpenAI-style completion request to the server.
fn inference(&self, request: &InferenceRequest) -> Result<InferenceResponse, RealizarError> {
let url = &self.config.base_url;
// Best-effort port sanity check: only fires when the text after the last
// ':' parses as an integer (URLs without an explicit port are skipped).
if let Some(port_str) = url.split(':').next_back() {
if let Ok(port) = port_str.parse::<u32>() {
if port > 65535 {
return Err(RealizarError::ConnectionError(format!(
"Invalid port in URL: {}",
url
)));
}
}
}
// f64 -> f32: acceptable precision loss for a sampling temperature.
#[allow(clippy::cast_possible_truncation)]
let completion_request = CompletionRequest {
model: self
.config
.model
.clone()
.unwrap_or_else(|| "default".to_string()),
prompt: request.prompt.clone(),
max_tokens: request.max_tokens,
temperature: Some(request.temperature as f32),
stream: false,
};
let timing = self.http_client.openai_completion(
&self.config.base_url,
&completion_request,
self.config.api_key.as_deref(),
)?;
// Per-token latencies are not available without streaming.
Ok(InferenceResponse {
text: timing.text,
tokens_generated: timing.tokens_generated,
ttft_ms: timing.ttft_ms,
total_time_ms: timing.total_time_ms,
itl_ms: vec![], })
}
}
/// Connection settings for an Ollama server.
#[cfg(feature = "bench-http")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaConfig {
pub base_url: String,
/// Model name as registered with Ollama.
pub model: String,
}
#[cfg(feature = "bench-http")]
impl Default for OllamaConfig {
/// Local Ollama on its default port, serving "llama2".
fn default() -> Self {
Self {
base_url: "http://localhost:11434".to_string(),
model: "llama2".to_string(),
}
}
}
/// Benchmark backend that talks to an Ollama server over HTTP.
#[cfg(feature = "bench-http")]
pub struct OllamaBackend {
config: OllamaConfig,
http_client: ModelHttpClient,
}
#[cfg(feature = "bench-http")]
impl OllamaBackend {
/// Creates a backend with a fresh HTTP client.
#[must_use]
pub fn new(config: OllamaConfig) -> Self {
Self {
config,
http_client: ModelHttpClient::new(),
}
}
/// Creates a backend reusing an existing HTTP client.
#[must_use]
pub fn with_client(config: OllamaConfig, client: ModelHttpClient) -> Self {
Self {
config,
http_client: client,
}
}
}
#[cfg(feature = "bench-http")]
impl RuntimeBackend for OllamaBackend {
// Static metadata. NOTE(review): version is a hard-coded placeholder, not
// queried from the server.
fn info(&self) -> BackendInfo {
BackendInfo {
runtime_type: RuntimeType::Ollama,
version: "0.1.0".to_string(), supports_streaming: true,
loaded_model: Some(self.config.model.clone()),
}
}
// Sends one non-streaming generate request via the HTTP client.
fn inference(&self, request: &InferenceRequest) -> Result<InferenceResponse, RealizarError> {
// f64 -> f32: acceptable precision loss for a sampling temperature.
#[allow(clippy::cast_possible_truncation)]
let ollama_request = OllamaRequest {
model: self.config.model.clone(),
prompt: request.prompt.clone(),
stream: false,
options: Some(OllamaOptions {
num_predict: Some(request.max_tokens),
temperature: Some(request.temperature as f32),
}),
};
let timing = self
.http_client
.ollama_generate(&self.config.base_url, &ollama_request)?;
// Per-token latencies are not available without streaming.
Ok(InferenceResponse {
text: timing.text,
tokens_generated: timing.tokens_generated,
ttft_ms: timing.ttft_ms,
total_time_ms: timing.total_time_ms,
itl_ms: vec![], })
}
}
/// Measurement plan: sample counts, percentiles, and timing windows.
#[derive(Debug, Clone)]
pub struct MeasurementProtocol {
pub latency_samples: usize,
/// Percentiles (0-100) to report for latency.
pub latency_percentiles: Vec<f64>,
pub throughput_duration: Duration,
/// Ramp-up window — presumably excluded from the measured throughput;
/// confirm with the measurement loop.
pub throughput_ramp_up: Duration,
pub memory_samples: usize,
pub memory_interval: Duration,
}
impl Default for MeasurementProtocol {
fn default() -> Self {
Self {
latency_samples: 100,
latency_percentiles: vec![50.0, 90.0, 95.0, 99.0, 99.9],
throughput_duration: Duration::from_secs(60),
throughput_ramp_up: Duration::from_secs(10),
memory_samples: 10,
memory_interval: Duration::from_secs(1),
}
}
}
impl MeasurementProtocol {
/// Same as `Default::default()`.
#[must_use]
pub fn new() -> Self {
Self::default()
}
/// Sets the number of latency samples to collect.
#[must_use]
pub fn with_latency_samples(mut self, samples: usize) -> Self {
self.latency_samples = samples;
self
}
/// Replaces the reported percentile set.
#[must_use]
pub fn with_percentiles(mut self, percentiles: Vec<f64>) -> Self {
self.latency_percentiles = percentiles;
self
}
/// Sets the throughput measurement window.
#[must_use]
pub fn with_throughput_duration(mut self, duration: Duration) -> Self {
self.throughput_duration = duration;
self
}
/// Sets the number of memory samples to collect.
#[must_use]
pub fn with_memory_samples(mut self, samples: usize) -> Self {
self.memory_samples = samples;
self
}
}
/// Summary statistics over a set of latency samples.
#[derive(Debug, Clone)]
pub struct LatencyStatistics {
pub mean: Duration,
pub std_dev: Duration,
pub min: Duration,
pub max: Duration,
/// Nearest-rank percentiles (see `from_samples`).
pub p50: Duration,
pub p90: Duration,
pub p95: Duration,
pub p99: Duration,
pub p999: Duration,
/// Number of samples the statistics were computed from.
pub samples: usize,
/// Approximate 95% confidence interval around the mean.
pub confidence_interval_95: (Duration, Duration),
}
impl LatencyStatistics {
/// Computes summary statistics from raw latency samples.
///
/// Percentiles use the nearest-rank method (`ceil(p/100 * n)`, 1-based,
/// clamped). The 95% confidence interval uses a normal approximation; for
/// n < 30 the critical value is inflated by a rough `2 + 4/n` heuristic
/// rather than an exact t-table lookup.
///
/// # Panics
/// Panics if `samples` is empty.
#[must_use]
pub fn from_samples(samples: &[Duration]) -> Self {
assert!(!samples.is_empty(), "samples must not be empty");
let n = samples.len();
let n_f64 = n as f64;
// Sum in u128 nanoseconds to avoid overflow; mean is integer division.
let sum_nanos: u128 = samples.iter().map(Duration::as_nanos).sum();
let mean_nanos = sum_nanos / n as u128;
let mean = Duration::from_nanos(mean_nanos as u64);
// Unbiased sample variance (n - 1); max(1.0) guards the n == 1 case.
let variance: f64 = samples
.iter()
.map(|s| {
let diff = s.as_nanos() as f64 - mean_nanos as f64;
diff * diff
})
.sum::<f64>()
/ (n_f64 - 1.0).max(1.0);
let std_dev_nanos = variance.sqrt();
let std_dev = Duration::from_nanos(std_dev_nanos as u64);
let mut sorted: Vec<Duration> = samples.to_vec();
sorted.sort();
let min = sorted[0];
let max = sorted[n - 1];
// Nearest-rank percentile, index clamped into [0, n - 1].
let percentile = |p: f64| -> Duration {
let idx = ((p / 100.0) * n_f64).ceil() as usize;
sorted[idx.saturating_sub(1).min(n - 1)]
};
let p50 = percentile(50.0);
let p90 = percentile(90.0);
let p95 = percentile(95.0);
let p99 = percentile(99.0);
let p999 = percentile(99.9);
// 1.96 is the normal 95% critical value; small n gets a crude inflation.
let t_value = if n >= 30 { 1.96 } else { 2.0 + 4.0 / n_f64 };
let margin = std_dev_nanos * t_value / n_f64.sqrt();
// Lower bound clamped at zero — a latency cannot be negative.
let lower = Duration::from_nanos((mean_nanos as f64 - margin).max(0.0) as u64);
let upper = Duration::from_nanos((mean_nanos as f64 + margin) as u64);
Self {
mean,
std_dev,
min,
max,
p50,
p90,
p95,
p99,
p999,
samples: n,
confidence_interval_95: (lower, upper),
}
}
}
/// Flags outliers using the modified z-score on the median absolute
/// deviation (MAD): indices whose |x - median| / (1.4826 * MAD) exceeds
/// `threshold` are returned. Fewer than three samples, or a zero MAD
/// (half or more of the values identical), yields no outliers.
pub fn detect_outliers(samples: &[f64], threshold: f64) -> Vec<usize> {
    if samples.len() < 3 {
        return Vec::new();
    }
    // Median of a scratch vector (sorted in place; NaNs compare as equal).
    let median_of = |values: &mut Vec<f64>| -> f64 {
        values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let mid = values.len() / 2;
        if values.len() % 2 == 0 {
            (values[mid - 1] + values[mid]) / 2.0
        } else {
            values[mid]
        }
    };
    let median = median_of(&mut samples.to_vec());
    let mad = median_of(&mut samples.iter().map(|x| (x - median).abs()).collect());
    // Degenerate spread: no meaningful scale to judge outliers against.
    if mad < f64::EPSILON {
        return Vec::new();
    }
    // 1.4826 makes the MAD a consistent estimator of sigma for normal data.
    const K: f64 = 1.4826;
    samples
        .iter()
        .enumerate()
        .filter_map(|(i, &x)| (((x - median) / (K * mad)).abs() > threshold).then_some(i))
        .collect()
}
/// Aggregate statistics for one benchmark metric.
#[derive(Debug, Clone)]
pub struct BenchmarkMetrics {
pub name: String,
pub mean: f64,
pub std_dev: f64,
pub samples: usize,
}
/// One baseline-vs-current comparison for a single metric.
#[derive(Debug, Clone)]
pub struct Regression {
pub metric: String,
pub baseline: f64,
pub current: f64,
/// Relative change in percent; positive means the mean increased.
pub change_percent: f64,
}
/// Classified outcome of a regression check.
#[derive(Debug, Clone)]
pub struct RegressionReport {
pub regressions: Vec<Regression>,
pub warnings: Vec<Regression>,
pub improvements: Vec<Regression>,
/// True when no change crossed the failure threshold.
pub passed: bool,
}
/// Thresholds (fractions, e.g. 0.05 = 5%) for classifying mean changes.
#[derive(Debug, Clone)]
pub struct RegressionDetector {
pub warning_threshold: f64,
pub failure_threshold: f64,
}
impl Default for RegressionDetector {
fn default() -> Self {
Self {
// 2% warns, 5% fails.
warning_threshold: 0.02, failure_threshold: 0.05, }
}
}
impl RegressionDetector {
    /// Classifies the change from `baseline` to `current`: an increase in
    /// the metric's mean beyond the failure threshold is a regression,
    /// beyond the warning threshold a warning, and a decrease past the
    /// warning threshold an improvement. At most one bucket is populated.
    pub fn compare(
        &self,
        baseline: &BenchmarkMetrics,
        current: &BenchmarkMetrics,
    ) -> RegressionReport {
        let relative = (current.mean - baseline.mean) / baseline.mean;
        let record = Regression {
            metric: baseline.name.clone(),
            baseline: baseline.mean,
            current: current.mean,
            change_percent: relative * 100.0,
        };
        let mut report = RegressionReport {
            regressions: Vec::new(),
            warnings: Vec::new(),
            improvements: Vec::new(),
            passed: true,
        };
        if relative > self.failure_threshold {
            report.regressions.push(record);
            report.passed = false;
        } else if relative > self.warning_threshold {
            report.warnings.push(record);
        } else if relative < -self.warning_threshold {
            report.improvements.push(record);
        }
        report
    }
}
/// Outcome of Welch's two-sample t-test.
#[derive(Debug, Clone)]
pub struct WelchTTestResult {
pub t_statistic: f64,
/// Welch–Satterthwaite degrees of freedom.
pub degrees_of_freedom: f64,
/// Approximate two-sided p-value.
pub p_value: f64,
/// True when `p_value` is below the caller-supplied alpha.
pub significant: bool,
}
/// Welch's unequal-variance t-test comparing the means of two samples.
///
/// Returns the t statistic, Welch–Satterthwaite degrees of freedom, and an
/// approximate two-sided p-value checked against `alpha`.
/// NOTE(review): empty samples produce NaN means (0/0) rather than an
/// error — callers are expected to pass non-empty slices.
pub fn welch_t_test(sample_a: &[f64], sample_b: &[f64], alpha: f64) -> WelchTTestResult {
let n1 = sample_a.len() as f64;
let n2 = sample_b.len() as f64;
let mean1 = sample_a.iter().sum::<f64>() / n1;
let mean2 = sample_b.iter().sum::<f64>() / n2;
// Unbiased sample variances; a single observation contributes zero.
let var1 = if n1 > 1.0 {
sample_a.iter().map(|x| (x - mean1).powi(2)).sum::<f64>() / (n1 - 1.0)
} else {
0.0
};
let var2 = if n2 > 1.0 {
sample_b.iter().map(|x| (x - mean2).powi(2)).sum::<f64>() / (n2 - 1.0)
} else {
0.0
};
// Squared standard errors of each sample mean.
let se1 = var1 / n1;
let se2 = var2 / n2;
let se_diff = (se1 + se2).sqrt();
// Zero spread on both sides: report no detectable difference.
if se_diff < f64::EPSILON {
return WelchTTestResult {
t_statistic: 0.0,
degrees_of_freedom: n1 + n2 - 2.0,
p_value: 1.0,
significant: false,
};
}
let t_stat = (mean1 - mean2) / se_diff;
// Welch–Satterthwaite approximation of the degrees of freedom.
let df_num = (se1 + se2).powi(2);
let df_denom = if n1 > 1.0 && se1 > f64::EPSILON {
se1.powi(2) / (n1 - 1.0)
} else {
0.0
} + if n2 > 1.0 && se2 > f64::EPSILON {
se2.powi(2) / (n2 - 1.0)
} else {
0.0
};
let df = if df_denom > f64::EPSILON {
df_num / df_denom
} else {
// Degenerate variances: fall back to the pooled-test df.
n1 + n2 - 2.0
};
let p_value = approximate_t_pvalue(t_stat.abs(), df);
WelchTTestResult {
t_statistic: t_stat,
degrees_of_freedom: df,
p_value,
significant: p_value < alpha,
}
}
// Two-sided p-value approximation for |t| with `df` degrees of freedom.
fn approximate_t_pvalue(t_abs: f64, df: f64) -> f64 {
// Large df: treat t as standard normal; the intended two-sided p-value is
// erfc(|t| / sqrt(2)), delegated to erfc_approx.
if df > 100.0 {
let z = t_abs;
let p = erfc_approx(z / std::f64::consts::SQRT_2);
return p;
}
// Student-t identity: p = I_x(df/2, 1/2) with x = df / (df + t^2), where
// I is the regularized incomplete beta function.
let ratio = df / (df + t_abs * t_abs);
incomplete_beta_approx(ratio, df / 2.0, 0.5)
}
/// Complementary error function `erfc(x)` via the Abramowitz & Stegun
/// 7.1.26 polynomial approximation (|error| <= 1.5e-7).
///
/// Bug fix: the previous version returned `1 - poly * exp(-x^2)`, which is
/// erf(x), not erfc(x) — so `erfc_approx(0.0)` was 0 instead of 1 and large
/// |t| statistics mapped to p-values near 1 in `approximate_t_pvalue`. The
/// complement is now returned directly, and negative inputs use the
/// reflection erfc(-x) = 2 - erfc(x).
fn erfc_approx(x: f64) -> f64 {
    let a1 = 0.254_829_592;
    let a2 = -0.284_496_736;
    let a3 = 1.421_413_741;
    let a4 = -1.453_152_027;
    let a5 = 1.061_405_429;
    let p = 0.327_591_1;
    let x_abs = x.abs();
    let t = 1.0 / (1.0 + p * x_abs);
    let poly = (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t;
    // erfc(|x|) = poly(t) * exp(-x^2)
    let tail = poly * (-x_abs * x_abs).exp();
    if x < 0.0 {
        2.0 - tail
    } else {
        tail
    }
}
/// Regularized incomplete beta function I_x(a, b), approximated with the
/// continued-fraction expansion (cf. Numerical Recipes "betai"/"betacf").
fn incomplete_beta_approx(x: f64, a: f64, b: f64) -> f64 {
// The continued fraction converges fast only for x below this pivot;
// otherwise apply the symmetry I_x(a, b) = 1 - I_{1-x}(b, a).
if x < (a + 1.0) / (a + b + 2.0) {
// Prefactor x^a (1-x)^b / B(a, b) in log space for stability; the
// standard formula's division by `a` happens at the use below.
let beta_factor =
gamma_ln(a + b) - gamma_ln(a) - gamma_ln(b) + a * x.ln() + b * (1.0 - x).ln();
let beta_factor = beta_factor.exp();
beta_factor * cf_beta(x, a, b) / a
} else {
1.0 - incomplete_beta_approx(1.0 - x, b, a)
}
}
/// Continued-fraction kernel for the incomplete beta function, evaluated
/// with the modified Lentz algorithm; each loop pass folds in one even and
/// one odd term of the expansion.
#[allow(clippy::many_single_char_names)] fn cf_beta(x: f64, a: f64, b: f64) -> f64 {
let max_iter = 100;
let eps = 1e-10;
// Floor substituted for near-zero denominators in the Lentz recurrence.
let tiny = 1e-30;
let qab = a + b;
let qap = a + 1.0;
let qam = a - 1.0;
let mut c = 1.0;
let mut d = 1.0 - qab * x / qap;
if d.abs() < tiny {
d = tiny;
}
d = 1.0 / d;
let mut h = d;
for m in 1..=max_iter {
let m_f = m as f64;
let m2 = 2.0 * m_f;
// Even-numbered term of the continued fraction.
let aa = m_f * (b - m_f) * x / ((qam + m2) * (a + m2));
d = 1.0 + aa * d;
if d.abs() < tiny {
d = tiny;
}
c = 1.0 + aa / c;
if c.abs() < tiny {
c = tiny;
}
d = 1.0 / d;
h *= d * c;
// Odd-numbered term.
let aa = -(a + m_f) * (qab + m_f) * x / ((a + m2) * (qap + m2));
d = 1.0 + aa * d;
if d.abs() < tiny {
d = tiny;
}
c = 1.0 + aa / c;
if c.abs() < tiny {
c = tiny;
}
d = 1.0 / d;
let del = d * c;
h *= del;
// Converged once the incremental factor is within eps of 1.
if (del - 1.0).abs() < eps {
break;
}
}
h
}
/// Natural log of the gamma function, ln Γ(x), via the Lanczos
/// approximation (g = 7, 9-term series). Non-positive inputs return +inf.
#[allow(clippy::excessive_precision)]
fn gamma_ln(x: f64) -> f64 {
    if x <= 0.0 {
        return f64::INFINITY;
    }
    const G: f64 = 7.0;
    const LANCZOS: [f64; 9] = [
        0.999_999_999_999_81,
        676.520_368_121_885,
        -1_259.139_216_722_403,
        771.323_428_777_653,
        -176.615_029_162_141,
        12.507_343_278_687,
        -0.138_571_095_265_72,
        9.984_369_578_02e-6,
        1.505_632_735_15e-7,
    ];
    // The series is evaluated at z = x - 1 (same shift as the reference form).
    let z = x - 1.0;
    let series = LANCZOS
        .iter()
        .enumerate()
        .skip(1)
        .fold(LANCZOS[0], |acc, (i, &c)| acc + c / (z + i as f64));
    let t = z + G + 0.5;
    0.5 * (2.0 * std::f64::consts::PI).ln() + (z + 0.5) * t.ln() - t + series.ln()
}
/// Tunables for a load test run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LoadTestConfig {
/// Number of concurrent workers.
pub concurrency: usize,
pub duration_secs: u64,
/// Target request rate; 0.0 presumably means unthrottled — confirm with
/// the runner implementation.
pub target_rps: f64,
pub timeout_ms: u64,
/// Warm-up seconds before measurement starts.
pub warmup_secs: u64,
/// p99 latency budget used as the pass/fail gate.
pub latency_threshold_ms: f64,
}
impl Default for LoadTestConfig {
/// Moderate defaults: 10 workers for 60 s, 5 s warm-up, 500 ms p99 budget.
fn default() -> Self {
Self {
concurrency: 10,
duration_secs: 60,
target_rps: 0.0, timeout_ms: 5000,
warmup_secs: 5,
latency_threshold_ms: 500.0, }
}
}
impl LoadTestConfig {
/// High-concurrency preset: 100 workers, 5 min, relaxed 1 s p99 budget.
#[must_use]
pub fn for_stress_test() -> Self {
Self {
concurrency: 100,
duration_secs: 300,
target_rps: 0.0,
timeout_ms: 10_000,
warmup_secs: 10,
latency_threshold_ms: 1000.0,
}
}
/// Single-worker preset for clean latency numbers: 10 rps, 200 ms budget.
#[must_use]
pub fn for_latency_test() -> Self {
Self {
concurrency: 1,
duration_secs: 60,
target_rps: 10.0, timeout_ms: 2000,
warmup_secs: 5,
latency_threshold_ms: 200.0,
}
}
/// Basic sanity check: all counts, durations, and budgets are positive.
#[must_use]
pub fn is_valid(&self) -> bool {
self.concurrency > 0
&& self.duration_secs > 0
&& self.timeout_ms > 0
&& self.latency_threshold_ms > 0.0
}
}
/// Aggregated outcome of a load test run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LoadTestResult {
pub total_requests: usize,
pub successful_requests: usize,
pub failed_requests: usize,
pub rps_achieved: f64,
pub latency_p50_ms: f64,
pub latency_p95_ms: f64,
pub latency_p99_ms: f64,
pub latency_max_ms: f64,
pub data_transferred_bytes: u64,
pub duration_secs: f64,
/// Failed requests as a fraction of total (0.0 - 1.0).
pub error_rate: f64,
/// Whether p99 stayed under the configured latency budget.
pub passed_latency_threshold: bool,
}
impl LoadTestResult {
/// Pass = latency budget met and error rate under 1%.
#[must_use]
pub fn is_passing(&self) -> bool {
self.passed_latency_threshold && self.error_rate < 0.01 }
/// Average transfer rate in decimal megabytes per second.
#[must_use]
pub fn throughput_mbps(&self) -> f64 {
if self.duration_secs > 0.0 {
(self.data_transferred_bytes as f64 / 1_000_000.0) / self.duration_secs
} else {
0.0
}
}
}
/// Executes (currently: simulates) a load test for a given configuration.
#[derive(Debug)]
pub struct LoadTestRunner {
config: LoadTestConfig,
}
impl LoadTestRunner {
/// Creates a runner for `config`.
#[must_use]
pub fn new(config: LoadTestConfig) -> Self {
Self { config }
}
/// The configuration this runner was built with.
#[must_use]
pub fn config(&self) -> &LoadTestConfig {
&self.config
}
/// Produces a synthetic result without any network traffic: each worker
/// is assumed to sustain ~10 rps, 1% of requests fail, and latency grows
/// with ln(concurrency).
/// NOTE(review): zero concurrency or duration makes total_requests 0 and
/// error_rate NaN — callers should gate on `config.is_valid()` first.
#[must_use]
pub fn simulate_run(&self) -> LoadTestResult {
let total_requests =
(self.config.concurrency as f64 * self.config.duration_secs as f64 * 10.0) as usize;
let error_count = total_requests / 100; let successful = total_requests - error_count;
let base_latency = 20.0; let concurrency_factor = (self.config.concurrency as f64).ln();
let p50 = base_latency + concurrency_factor * 5.0;
// Fixed spread: p95 = 2.5x p50, p99 = 4x p50, max = 2x p99.
let p95 = p50 * 2.5;
let p99 = p50 * 4.0;
let max = p99 * 2.0;
let duration = self.config.duration_secs as f64;
let rps = if duration > 0.0 {
total_requests as f64 / duration
} else {
0.0
};
LoadTestResult {
total_requests,
successful_requests: successful,
failed_requests: error_count,
rps_achieved: rps,
latency_p50_ms: p50,
latency_p95_ms: p95,
latency_p99_ms: p99,
latency_max_ms: max,
// 1 KiB of payload assumed per request.
data_transferred_bytes: (total_requests * 1024) as u64, duration_secs: duration,
error_rate: error_count as f64 / total_requests as f64,
passed_latency_threshold: p99 < self.config.latency_threshold_ms,
}
}
}
/// Configuration for the simulated multi-GPU scaling benchmarks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DistributedBenchConfig {
/// GPU counts to sweep (also bounds the TP/PP degrees tried).
pub gpu_counts: Vec<usize>,
pub iterations: usize,
pub warmup: usize,
/// Model size in parameters (default 7B).
pub model_params: u64,
pub seq_len: usize,
pub batch_size: usize,
/// Minimum acceptable scaling efficiency (achieved speedup / GPU count).
pub efficiency_threshold: f64,
}
impl Default for DistributedBenchConfig {
fn default() -> Self {
Self {
gpu_counts: vec![1, 2, 4, 8],
iterations: 100,
warmup: 10,
model_params: 7_000_000_000, seq_len: 2048,
batch_size: 1,
efficiency_threshold: 0.85, }
}
}
impl DistributedBenchConfig {
/// Preset for a ~125M-parameter model on up to 2 GPUs.
#[must_use]
pub fn for_small_model() -> Self {
Self {
gpu_counts: vec![1, 2],
iterations: 50,
warmup: 5,
model_params: 125_000_000, seq_len: 512,
batch_size: 1,
efficiency_threshold: 0.80,
}
}
/// Preset for a ~70B-parameter model on 2-8 GPUs.
#[must_use]
pub fn for_large_model() -> Self {
Self {
gpu_counts: vec![2, 4, 8],
iterations: 50,
warmup: 5,
model_params: 70_000_000_000, seq_len: 4096,
batch_size: 1,
efficiency_threshold: 0.85,
}
}
}
/// Scaling measurement for one GPU count.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScalingEfficiencyResult {
pub gpu_count: usize,
pub throughput_tps: f64,
pub latency_p50_ms: f64,
pub latency_p99_ms: f64,
/// Achieved speedup divided by GPU count (1.0 = perfect linear scaling).
pub efficiency: f64,
pub comm_overhead_ms: f64,
pub theoretical_speedup: f64,
pub achieved_speedup: f64,
}
impl ScalingEfficiencyResult {
/// True when the measured efficiency reaches `threshold`.
#[must_use]
pub fn meets_threshold(&self, threshold: f64) -> bool {
self.efficiency >= threshold
}
/// Parallelizable fraction implied by Amdahl's law: solves
/// s = 1 / ((1 - p) + p / n) for p given achieved speedup `s` on `n`
/// GPUs. Degenerate inputs (n <= 1 or s <= 1) return 1.0.
#[must_use]
pub fn parallel_fraction(&self) -> f64 {
let n = self.gpu_count as f64;
let s = self.achieved_speedup;
if n <= 1.0 || s <= 1.0 {
return 1.0;
}
(n * s - n) / (n * s - s)
}
}
/// Tensor-parallel measurement for one TP degree.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TensorParallelResult {
pub tp_degree: usize,
pub forward_ms: f64,
pub all_reduce_ms: f64,
/// Share of step time spent in communication, in percent.
pub comm_overhead_pct: f64,
pub memory_per_gpu_mb: f64,
pub effective_tflops: f64,
}
/// Pipeline-parallel measurement for one PP degree.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineParallelResult {
pub pp_degree: usize,
pub micro_batches: usize,
/// Fraction of time lost to pipeline fill/drain bubbles.
pub bubble_ratio: f64,
pub throughput_tps: f64,
pub inter_stage_ms: f64,
pub memory_per_stage_mb: f64,
}
/// Collective-communication measurement for one payload size.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommunicationResult {
pub operation: String,
pub data_size_bytes: usize,
pub latency_us: f64,
/// Effective bandwidth in gigabits per second.
pub bandwidth_gbps: f64,
pub world_size: usize,
}
/// Analytic (simulated) benchmark suite for distributed inference: scaling,
/// tensor parallelism, pipeline parallelism, and collective communication.
/// All numbers come from closed-form cost models, not real GPU runs.
#[derive(Debug)]
pub struct DistributedBenchSuite {
config: DistributedBenchConfig,
scaling_results: Vec<ScalingEfficiencyResult>,
tp_results: Vec<TensorParallelResult>,
pp_results: Vec<PipelineParallelResult>,
comm_results: Vec<CommunicationResult>,
}
impl DistributedBenchSuite {
/// Creates a suite with empty result sets.
#[must_use]
pub fn new(config: DistributedBenchConfig) -> Self {
Self {
config,
scaling_results: Vec::new(),
tp_results: Vec::new(),
pp_results: Vec::new(),
comm_results: Vec::new(),
}
}
/// The configuration this suite was built with.
#[must_use]
pub fn config(&self) -> &DistributedBenchConfig {
&self.config
}
/// Models multi-GPU scaling with Amdahl's law (fixed 90% parallel
/// fraction) plus a 5% communication penalty per additional GPU.
pub fn run_scaling_benchmark(&mut self) {
let base_throughput = self.calculate_theoretical_throughput(1);
let base_latency = 1000.0 / base_throughput;
// clone() ends the borrow of self.config so results can be pushed below.
for &gpu_count in &self.config.gpu_counts.clone() {
let parallel_fraction = 0.90;
let theoretical_speedup =
1.0 / ((1.0 - parallel_fraction) + parallel_fraction / gpu_count as f64);
let comm_overhead_factor = 1.0 + 0.05 * (gpu_count - 1) as f64;
let achieved_speedup = theoretical_speedup / comm_overhead_factor;
let throughput = base_throughput * achieved_speedup;
let latency_p50 = base_latency / achieved_speedup;
let latency_p99 = latency_p50 * 1.5;
// Efficiency = achieved speedup per GPU; 1.0 for the single-GPU case.
let efficiency = if gpu_count > 1 {
achieved_speedup / gpu_count as f64
} else {
1.0
};
let comm_overhead_ms = if gpu_count > 1 {
(theoretical_speedup - achieved_speedup) * base_latency
} else {
0.0
};
self.scaling_results.push(ScalingEfficiencyResult {
gpu_count,
throughput_tps: throughput,
latency_p50_ms: latency_p50,
latency_p99_ms: latency_p99,
efficiency,
comm_overhead_ms,
theoretical_speedup,
achieved_speedup,
});
}
}
/// Models tensor parallelism: forward time divides by the TP degree, with
/// an alpha-beta all-reduce cost added for TP > 1.
pub fn run_tensor_parallel_benchmark(&mut self) {
let base_flops = self.calculate_model_flops();
for tp_degree in [1, 2, 4, 8] {
// Only simulate degrees the configured GPU counts could host.
if tp_degree > self.config.gpu_counts.iter().max().copied().unwrap_or(1) {
continue;
}
// 50 ms reference forward pass for a 7B model, scaled by model size.
let base_forward_ms = 50.0; let forward_ms =
base_forward_ms / tp_degree as f64 * (self.config.model_params as f64 / 7e9);
// NOTE(review): params / 256 as "KB" implies 4 bytes/element over
// 1024; the memory model below assumes 2 bytes — confirm intent.
let tensor_size_kb = (self.config.model_params / tp_degree as u64) as f64 / 256.0; let all_reduce_ms = if tp_degree > 1 {
(5.0 + 0.1 * tensor_size_kb) / 1000.0
} else {
0.0
};
let total_ms = forward_ms + all_reduce_ms;
let comm_overhead_pct = if total_ms > 0.0 {
all_reduce_ms / total_ms * 100.0
} else {
0.0
};
// 2 bytes per parameter, sharded evenly across TP ranks.
let total_memory_mb = self.config.model_params as f64 * 2.0 / 1e6; let memory_per_gpu_mb = total_memory_mb / tp_degree as f64;
let effective_tflops = if total_ms > 0.0 {
base_flops / (total_ms / 1000.0) / 1e12
} else {
0.0
};
self.tp_results.push(TensorParallelResult {
tp_degree,
forward_ms,
all_reduce_ms,
comm_overhead_pct,
memory_per_gpu_mb,
effective_tflops,
});
}
}
/// Models pipeline parallelism with the standard bubble formula
/// (p - 1) / (p - 1 + m) for p stages and m micro-batches.
pub fn run_pipeline_parallel_benchmark(&mut self) {
let base_throughput = self.calculate_theoretical_throughput(1);
for pp_degree in [1, 2, 4, 8] {
if pp_degree > self.config.gpu_counts.iter().max().copied().unwrap_or(1) {
continue;
}
// Heuristic: 4 micro-batches per pipeline stage.
let micro_batches = pp_degree * 4;
let bubble_ratio = if pp_degree > 1 {
(pp_degree - 1) as f64 / (pp_degree - 1 + micro_batches) as f64
} else {
0.0
};
let efficiency = 1.0 - bubble_ratio;
let throughput_tps = base_throughput * pp_degree as f64 * efficiency;
let inter_stage_ms = if pp_degree > 1 { 0.5 } else { 0.0 };
let total_memory_mb = self.config.model_params as f64 * 2.0 / 1e6;
let memory_per_stage_mb = total_memory_mb / pp_degree as f64;
self.pp_results.push(PipelineParallelResult {
pp_degree,
micro_batches,
bubble_ratio,
throughput_tps,
inter_stage_ms,
memory_per_stage_mb,
});
}
}
/// Models all-reduce / all-gather cost over a sweep of payload sizes with
/// an alpha-beta latency model scaled by ln(world_size).
pub fn run_communication_benchmark(&mut self) {
let world_size = self.config.gpu_counts.iter().max().copied().unwrap_or(1);
// 1 KiB, 1 MiB, 10 MiB, and 100 MiB payloads.
let data_sizes: Vec<usize> = vec![
1024, 1024 * 1024, 10 * 1024 * 1024, 100 * 1024 * 1024, ];
for data_size in data_sizes {
// alpha = fixed startup cost (us); beta = per-KB transfer cost (us).
let alpha_us = 3.0;
let beta_us_per_kb = 0.08;
let size_kb = data_size as f64 / 1024.0;
// NOTE(review): ln(world) rather than log2(world) — presumably a
// smooth stand-in for tree depth; confirm intent.
let latency_us = (world_size as f64).ln() * (alpha_us + beta_us_per_kb * size_kb);
// bits / (us * 1000) == bits/ns == Gbit/s.
let bandwidth_gbps = if latency_us > 0.0 {
(data_size as f64 * 8.0) / (latency_us * 1000.0) } else {
0.0
};
self.comm_results.push(CommunicationResult {
operation: "all_reduce".to_string(),
data_size_bytes: data_size,
latency_us,
bandwidth_gbps,
world_size,
});
// All-gather modeled as 20% cheaper than all-reduce.
let all_gather_latency = latency_us * 0.8; let all_gather_bw = bandwidth_gbps * 1.2;
self.comm_results.push(CommunicationResult {
operation: "all_gather".to_string(),
data_size_bytes: data_size,
latency_us: all_gather_latency,
bandwidth_gbps: all_gather_bw,
world_size,
});
}
}
/// Runs every benchmark family in sequence.
pub fn run_all(&mut self) {
self.run_scaling_benchmark();
self.run_tensor_parallel_benchmark();
self.run_pipeline_parallel_benchmark();
self.run_communication_benchmark();
}
/// Scaling results accumulated so far.
#[must_use]
pub fn scaling_results(&self) -> &[ScalingEfficiencyResult] {
&self.scaling_results
}
/// Tensor-parallel results accumulated so far.
#[must_use]
pub fn tp_results(&self) -> &[TensorParallelResult] {
&self.tp_results
}
/// Pipeline-parallel results accumulated so far.
#[must_use]
pub fn pp_results(&self) -> &[PipelineParallelResult] {
&self.pp_results
}
/// Communication results accumulated so far.
#[must_use]
pub fn comm_results(&self) -> &[CommunicationResult] {
&self.comm_results
}
/// True when every scaling result meets the configured efficiency bar
/// (vacuously true before `run_scaling_benchmark` has run).
#[must_use]
pub fn all_meet_efficiency_threshold(&self) -> bool {
self.scaling_results
.iter()
.all(|r| r.meets_threshold(self.config.efficiency_threshold))
}
/// Rolls the collected results up into a single summary record.
#[must_use]
pub fn summary(&self) -> DistributedBenchSummary {
let max_scaling = self
.scaling_results
.iter()
.map(|r| r.gpu_count)
.max()
.unwrap_or(1);
let max_efficiency = self
.scaling_results
.iter()
.map(|r| r.efficiency)
.fold(0.0_f64, f64::max);
let min_efficiency = self
.scaling_results
.iter()
.map(|r| r.efficiency)
.fold(1.0_f64, f64::min);
let max_throughput = self
.scaling_results
.iter()
.map(|r| r.throughput_tps)
.fold(0.0_f64, f64::max);
let avg_tp_overhead = if self.tp_results.is_empty() {
0.0
} else {
self.tp_results
.iter()
.map(|r| r.comm_overhead_pct)
.sum::<f64>()
/ self.tp_results.len() as f64
};
let avg_pp_bubble = if self.pp_results.is_empty() {
0.0
} else {
self.pp_results.iter().map(|r| r.bubble_ratio).sum::<f64>()
/ self.pp_results.len() as f64
};
DistributedBenchSummary {
max_scaling,
max_efficiency,
min_efficiency,
max_throughput_tps: max_throughput,
avg_tp_comm_overhead_pct: avg_tp_overhead,
avg_pp_bubble_ratio: avg_pp_bubble,
meets_threshold: self.all_meet_efficiency_threshold(),
}
}
// 100 tok/s reference for a 7B model, scaled inversely with model size and
// linearly with batch size. The GPU-count argument is currently unused.
fn calculate_theoretical_throughput(&self, _gpu_count: usize) -> f64 {
let base_tps = 100.0 * (7e9 / self.config.model_params as f64);
base_tps * (self.config.batch_size as f64)
}
// ~2 FLOPs per parameter per token, for one full sequence.
fn calculate_model_flops(&self) -> f64 {
2.0 * self.config.model_params as f64 * self.config.seq_len as f64
}
}
/// Flattened roll-up of a `DistributedBenchSuite` run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DistributedBenchSummary {
/// Largest GPU count that appears in the scaling results.
pub max_scaling: usize,
pub max_efficiency: f64,
pub min_efficiency: f64,
pub max_throughput_tps: f64,
pub avg_tp_comm_overhead_pct: f64,
pub avg_pp_bubble_ratio: f64,
/// True when every scaling result met the configured efficiency threshold.
pub meets_threshold: bool,
}
/// Compute device a runtime executes on.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ComputeBackendType {
Cpu,
Wgpu,
Cuda,
}
/// Lowercase names used in reports and tables.
impl std::fmt::Display for ComputeBackendType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Cpu => write!(f, "cpu"),
Self::Wgpu => write!(f, "wgpu"),
Self::Cuda => write!(f, "cuda"),
}
}
}
impl ComputeBackendType {
/// Case-insensitive parse accepting the aliases "gpu" (Wgpu) and
/// "nvidia" (Cuda); returns `None` for unknown names.
#[must_use]
pub fn parse(s: &str) -> Option<Self> {
match s.to_lowercase().as_str() {
"cpu" => Some(Self::Cpu),
"wgpu" | "gpu" => Some(Self::Wgpu),
"cuda" | "nvidia" => Some(Self::Cuda),
_ => None,
}
}
/// Every backend variant, in declaration order.
#[must_use]
pub fn all() -> Vec<Self> {
vec![Self::Cpu, Self::Wgpu, Self::Cuda]
}
}
/// One cell of the runtime x backend benchmark matrix.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatrixBenchmarkEntry {
pub runtime: RuntimeType,
pub backend: ComputeBackendType,
pub model: String,
/// False when the combination could not be benchmarked.
pub available: bool,
pub p50_latency_ms: f64,
pub p99_latency_ms: f64,
pub throughput_tps: f64,
pub cold_start_ms: f64,
/// Number of latency samples collected before stopping.
pub samples: usize,
/// Coefficient of variation of the latencies when sampling stopped.
pub cv_at_stop: f64,
pub notes: String,
}
/// Default: an unavailable Realizar/CPU placeholder with zeroed metrics.
impl Default for MatrixBenchmarkEntry {
fn default() -> Self {
Self {
runtime: RuntimeType::Realizar,
backend: ComputeBackendType::Cpu,
model: String::new(),
available: false,
p50_latency_ms: 0.0,
p99_latency_ms: 0.0,
throughput_tps: 0.0,
cold_start_ms: 0.0,
samples: 0,
cv_at_stop: 0.0,
notes: String::new(),
}
}
}
impl MatrixBenchmarkEntry {
/// Placeholder entry for a combination that could not be benchmarked.
#[must_use]
pub fn unavailable(runtime: RuntimeType, backend: ComputeBackendType) -> Self {
Self {
runtime,
backend,
available: false,
notes: "Backend not available".to_string(),
..Default::default()
}
}
/// Builds an available entry from raw samples: p50/p99 via the file's
/// `percentile` helper, throughput as the mean of `throughputs_tps`, and
/// the latency CV at stopping time. An empty latency slice degrades to
/// `unavailable`.
#[must_use]
pub fn from_samples(
runtime: RuntimeType,
backend: ComputeBackendType,
model: &str,
latencies_ms: &[f64],
throughputs_tps: &[f64],
cold_start_ms: f64,
) -> Self {
let samples = latencies_ms.len();
if samples == 0 {
return Self::unavailable(runtime, backend);
}
let p50_latency = percentile(latencies_ms, 50.0);
let p99_latency = percentile(latencies_ms, 99.0);
// Mean throughput; zero when no throughput samples were taken.
let throughput = if throughputs_tps.is_empty() {
0.0
} else {
throughputs_tps.iter().sum::<f64>() / throughputs_tps.len() as f64
};
let cv = compute_cv(latencies_ms);
Self {
runtime,
backend,
model: model.to_string(),
available: true,
p50_latency_ms: p50_latency,
p99_latency_ms: p99_latency,
throughput_tps: throughput,
cold_start_ms,
samples,
cv_at_stop: cv,
notes: String::new(),
}
}
/// Attaches a free-form note to the entry.
#[must_use]
pub fn with_notes(mut self, notes: &str) -> Self {
self.notes = notes.to_string();
self
}
}
/// Full benchmark matrix plus the metadata needed to interpret it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkMatrix {
/// Schema version of the serialized matrix.
pub version: String,
pub timestamp: String,
pub model: String,
pub hardware: HardwareSpec,
/// Human-readable description of the sampling methodology.
pub methodology: String,
pub cv_threshold: f64,
pub entries: Vec<MatrixBenchmarkEntry>,
}
impl BenchmarkMatrix {
    /// Creates an empty matrix for `model` on the given hardware, recording
    /// the default CV-based stopping methodology in the metadata.
    #[must_use]
    pub fn new(model: &str, hardware: HardwareSpec) -> Self {
        Self {
            version: "1.1".to_string(),
            timestamp: chrono_timestamp(),
            model: model.to_string(),
            hardware,
            methodology: "CV-based stopping (Hoefler & Belli SC'15)".to_string(),
            cv_threshold: 0.05,
            entries: Vec::new(),
        }
    }
    /// Inserts `entry`, replacing any previous entry for the same
    /// (runtime, backend) cell so the matrix never holds duplicates.
    pub fn add_entry(&mut self, entry: MatrixBenchmarkEntry) {
        self.entries
            .retain(|e| e.runtime != entry.runtime || e.backend != entry.backend);
        self.entries.push(entry);
    }
    /// Looks up the entry for an exact (runtime, backend) pair.
    #[must_use]
    pub fn get_entry(
        &self,
        runtime: RuntimeType,
        backend: ComputeBackendType,
    ) -> Option<&MatrixBenchmarkEntry> {
        self.entries
            .iter()
            .find(|e| e.runtime == runtime && e.backend == backend)
    }
    /// All entries recorded for `runtime`, across backends.
    #[must_use]
    pub fn entries_for_runtime(&self, runtime: RuntimeType) -> Vec<&MatrixBenchmarkEntry> {
        self.entries
            .iter()
            .filter(|e| e.runtime == runtime)
            .collect()
    }
    /// All entries recorded for `backend`, across runtimes.
    #[must_use]
    pub fn entries_for_backend(&self, backend: ComputeBackendType) -> Vec<&MatrixBenchmarkEntry> {
        self.entries
            .iter()
            .filter(|e| e.backend == backend)
            .collect()
    }
    /// Available entry with the lowest p50 latency for `backend`.
    /// `total_cmp` replaces the previous `partial_cmp(..).unwrap()`, which
    /// panicked when a latency was NaN; NaN now orders after real values.
    #[must_use]
    pub fn fastest_for_backend(
        &self,
        backend: ComputeBackendType,
    ) -> Option<&MatrixBenchmarkEntry> {
        self.entries_for_backend(backend)
            .into_iter()
            .filter(|e| e.available)
            .min_by(|a, b| a.p50_latency_ms.total_cmp(&b.p50_latency_ms))
    }
    /// Available entry with the highest throughput for `backend`.
    /// `total_cmp` keeps this panic-free in the presence of NaN.
    #[must_use]
    pub fn highest_throughput_for_backend(
        &self,
        backend: ComputeBackendType,
    ) -> Option<&MatrixBenchmarkEntry> {
        self.entries_for_backend(backend)
            .into_iter()
            .filter(|e| e.available)
            .max_by(|a, b| a.throughput_tps.total_cmp(&b.throughput_tps))
    }
    /// Renders the matrix as a markdown table sorted by runtime then
    /// backend; unavailable cells render as dashes.
    #[must_use]
    pub fn to_markdown_table(&self) -> String {
        let mut table = String::new();
        table.push_str("| Runtime | Backend | p50 Latency | p99 Latency | Throughput | Cold Start | Samples | CV |\n");
        table.push_str("|---------|---------|-------------|-------------|------------|------------|---------|----|\n");
        let mut sorted_entries = self.entries.clone();
        // sort_by_cached_key formats each row's key once, instead of twice
        // per comparison as the previous sort_by closure did.
        sorted_entries.sort_by_cached_key(|e| (format!("{:?}", e.runtime), e.backend.to_string()));
        for entry in &sorted_entries {
            if entry.available {
                let _ = writeln!(
                    table,
                    "| **{}** | {} | {:.1}ms | {:.1}ms | {:.1} tok/s | {:.0}ms | {} | {:.3} |",
                    format!("{:?}", entry.runtime).to_lowercase(),
                    entry.backend,
                    entry.p50_latency_ms,
                    entry.p99_latency_ms,
                    entry.throughput_tps,
                    entry.cold_start_ms,
                    entry.samples,
                    entry.cv_at_stop,
                );
            } else {
                let _ = writeln!(
                    table,
                    "| {} | {} | - | - | - | - | - | - |",
                    format!("{:?}", entry.runtime).to_lowercase(),
                    entry.backend,
                );
            }
        }
        table
    }
    /// Serializes the matrix to pretty-printed JSON.
    ///
    /// # Errors
    /// Returns any `serde_json` serialization error.
    pub fn to_json(&self) -> Result<String, serde_json::Error> {
        serde_json::to_string_pretty(self)
    }
    /// Deserializes a matrix from JSON.
    ///
    /// # Errors
    /// Returns any `serde_json` deserialization error.
    pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
        serde_json::from_str(json)
    }
}
/// Sweep configuration for the runtime x backend benchmark matrix.
#[derive(Debug, Clone)]
pub struct MatrixBenchmarkConfig {
pub runtimes: Vec<RuntimeType>,
pub backends: Vec<ComputeBackendType>,
pub model_path: String,
pub prompt: String,
pub max_tokens: usize,
/// CV at which sampling may stop early, bounded by the sample limits.
pub cv_threshold: f64,
pub min_samples: usize,
pub max_samples: usize,
pub warmup_iterations: usize,
}
impl Default for MatrixBenchmarkConfig {
fn default() -> Self {
Self {
runtimes: vec![
RuntimeType::Realizar,
RuntimeType::LlamaCpp,
RuntimeType::Ollama,
],
backends: vec![ComputeBackendType::Cpu, ComputeBackendType::Wgpu],
model_path: String::new(),
prompt: "Explain machine learning in one sentence.".to_string(),
max_tokens: 50,
cv_threshold: 0.05,
min_samples: 30,
max_samples: 200,
warmup_iterations: 5,
}
}
}
/// Per-backend winners extracted from a `BenchmarkMatrix`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackendSummary {
pub backend: ComputeBackendType,
pub available_runtimes: usize,
/// Lowercased runtime name with the lowest p50 latency, if any ran.
pub fastest_runtime: Option<String>,
pub fastest_p50_ms: f64,
pub highest_throughput_runtime: Option<String>,
pub highest_throughput_tps: f64,
}
/// Whole-matrix roll-up: totals plus per-backend and overall winners.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatrixSummary {
pub total_entries: usize,
pub available_entries: usize,
pub backend_summaries: Vec<BackendSummary>,
/// (runtime, backend) of the lowest-p50 available entry.
pub overall_fastest: Option<(String, String)>,
pub overall_highest_throughput: Option<(String, String)>,
}
impl BenchmarkMatrix {
    /// Aggregates the matrix into per-backend and overall winners.
    ///
    /// Float comparisons use `total_cmp`, so NaN metrics can no longer
    /// panic (the previous `partial_cmp(..).unwrap()` calls could).
    #[must_use]
    pub fn summary(&self) -> MatrixSummary {
        let total_entries = self.entries.len();
        let available_entries = self.entries.iter().filter(|e| e.available).count();
        let mut backend_summaries = Vec::new();
        for backend in ComputeBackendType::all() {
            let entries: Vec<_> = self.entries_for_backend(backend);
            let available: Vec<_> = entries.iter().filter(|e| e.available).collect();
            // Lowest p50 wins "fastest"; highest tok/s wins throughput.
            let fastest = available
                .iter()
                .min_by(|a, b| a.p50_latency_ms.total_cmp(&b.p50_latency_ms));
            let highest_tp = available
                .iter()
                .max_by(|a, b| a.throughput_tps.total_cmp(&b.throughput_tps));
            backend_summaries.push(BackendSummary {
                backend,
                available_runtimes: available.len(),
                fastest_runtime: fastest.map(|e| format!("{:?}", e.runtime).to_lowercase()),
                fastest_p50_ms: fastest.map_or(0.0, |e| e.p50_latency_ms),
                highest_throughput_runtime: highest_tp
                    .map(|e| format!("{:?}", e.runtime).to_lowercase()),
                highest_throughput_tps: highest_tp.map_or(0.0, |e| e.throughput_tps),
            });
        }
        let available = self.entries.iter().filter(|e| e.available);
        let overall_fastest = available
            .clone()
            .min_by(|a, b| a.p50_latency_ms.total_cmp(&b.p50_latency_ms))
            .map(|e| {
                (
                    format!("{:?}", e.runtime).to_lowercase(),
                    e.backend.to_string(),
                )
            });
        let overall_highest_throughput = available
            .max_by(|a, b| a.throughput_tps.total_cmp(&b.throughput_tps))
            .map(|e| {
                (
                    format!("{:?}", e.runtime).to_lowercase(),
                    e.backend.to_string(),
                )
            });
        MatrixSummary {
            total_entries,
            available_entries,
            backend_summaries,
            overall_fastest,
            overall_highest_throughput,
        }
    }
}
/// Configuration for a GPU-parity benchmark comparing the realizar GPU
/// path against an Ollama server.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuParityBenchmark {
    /// Path to the model file to benchmark (empty by default).
    pub model_path: String,
    /// Prompt submitted for generation.
    pub prompt: String,
    /// Maximum number of tokens to generate per request.
    pub max_tokens: usize,
    /// Base URL of the Ollama server.
    pub ollama_endpoint: String,
    /// Untimed warm-up iterations run before measurement.
    pub warmup_iterations: usize,
    /// Timed measurement iterations.
    pub measurement_iterations: usize,
    /// Target coefficient of variation for measurement stability.
    pub target_cv: f64,
}
impl Default for GpuParityBenchmark {
    /// Defaults target a local Ollama server with a short fixed prompt,
    /// 3 warm-up and 10 measured iterations, and a 5% CV target.
    fn default() -> Self {
        Self {
            // No model baked in; use `new` to supply one.
            model_path: String::new(),
            prompt: String::from("The quick brown fox"),
            ollama_endpoint: String::from("http://localhost:11434"),
            max_tokens: 32,
            warmup_iterations: 3,
            measurement_iterations: 10,
            target_cv: 0.05,
        }
    }
}
impl GpuParityBenchmark {
    /// Create a benchmark for `model_path`, keeping every other default.
    #[must_use]
    pub fn new(model_path: impl Into<String>) -> Self {
        Self {
            model_path: model_path.into(),
            ..Self::default()
        }
    }

    /// Override the generation prompt.
    #[must_use]
    pub fn with_prompt(self, prompt: impl Into<String>) -> Self {
        Self {
            prompt: prompt.into(),
            ..self
        }
    }

    /// Override the per-request token budget.
    #[must_use]
    pub fn with_max_tokens(self, max_tokens: usize) -> Self {
        Self { max_tokens, ..self }
    }

    /// Point at a different Ollama server.
    #[must_use]
    pub fn with_ollama_endpoint(self, endpoint: impl Into<String>) -> Self {
        Self {
            ollama_endpoint: endpoint.into(),
            ..self
        }
    }

    /// Set the number of untimed warm-up iterations.
    #[must_use]
    pub fn with_warmup(self, warmup: usize) -> Self {
        Self {
            warmup_iterations: warmup,
            ..self
        }
    }

    /// Set the number of timed measurement iterations.
    #[must_use]
    pub fn with_iterations(self, iterations: usize) -> Self {
        Self {
            measurement_iterations: iterations,
            ..self
        }
    }
}
/// Outcome of a GPU-parity run: realizar-on-GPU throughput versus Ollama.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuParityResult {
    /// realizar GPU throughput in tokens/s.
    pub realizar_gpu_tps: f64,
    /// Ollama throughput in tokens/s.
    pub ollama_tps: f64,
    /// `ollama_tps / realizar_gpu_tps`; infinite when the GPU produced no throughput.
    pub gap_ratio: f64,
    /// Coefficient of variation of the measurements.
    pub cv: f64,
    /// Name of the GPU device measured.
    pub gpu_device: String,
    /// VRAM in MiB on the measured device.
    pub vram_mb: u64,
    /// realizar p50 latency in ms (0.0 until filled in by the caller).
    pub realizar_p50_ms: f64,
    /// Ollama p50 latency in ms (0.0 until filled in by the caller).
    pub ollama_p50_ms: f64,
}
impl GpuParityResult {
    /// CPU SIMD reference throughput (tok/s) used by the speedup checks.
    const CPU_BASELINE_TPS: f64 = 5.0;

    /// Build a result, deriving `gap_ratio = ollama_tps / realizar_gpu_tps`
    /// (infinite when the GPU produced no throughput). The p50 latency
    /// fields start at 0.0.
    #[must_use]
    pub fn new(
        realizar_gpu_tps: f64,
        ollama_tps: f64,
        cv: f64,
        gpu_device: impl Into<String>,
        vram_mb: u64,
    ) -> Self {
        let gap_ratio = if realizar_gpu_tps > 0.0 {
            ollama_tps / realizar_gpu_tps
        } else {
            f64::INFINITY
        };
        Self {
            realizar_gpu_tps,
            ollama_tps,
            gap_ratio,
            cv,
            vram_mb,
            gpu_device: gpu_device.into(),
            realizar_p50_ms: 0.0,
            ollama_p50_ms: 0.0,
        }
    }

    /// M2 milestone: within 2x of Ollama.
    #[must_use]
    pub fn achieves_m2_parity(&self) -> bool {
        self.gap_ratio <= 2.0
    }

    /// M4 milestone: within 1.25x of Ollama.
    #[must_use]
    pub fn achieves_m4_parity(&self) -> bool {
        self.gap_ratio <= 1.25
    }

    /// True when GPU throughput beats the 5 tok/s CPU baseline.
    #[must_use]
    pub fn gpu_faster_than_cpu(&self) -> bool {
        self.realizar_gpu_tps > Self::CPU_BASELINE_TPS
    }

    /// True when the run's coefficient of variation is under 5%.
    #[must_use]
    pub fn measurements_stable(&self) -> bool {
        self.cv < 0.05
    }

    /// Speedup factor over the 5 tok/s CPU baseline.
    #[must_use]
    pub fn cpu_speedup(&self) -> f64 {
        self.realizar_gpu_tps / Self::CPU_BASELINE_TPS
    }
}
/// Statistical comparison of a claimed performance gap against the
/// measured one, with a set of falsifiable claims attached.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GapAnalysis {
    /// Gap ratio as originally claimed.
    pub claimed_gap: f64,
    /// Gap ratio actually measured.
    pub measured_gap: f64,
    /// Significance p-value (0.0 until set via `with_statistics`).
    pub p_value: f64,
    /// Lower bound of the 95% confidence interval.
    pub ci_95_lower: f64,
    /// Upper bound of the 95% confidence interval.
    pub ci_95_upper: f64,
    /// Percentage of attached claims that verified (see `calculate_popper_score`).
    pub popper_score: f64,
    /// Claims evaluated as part of this analysis.
    pub claims: Vec<FalsifiableClaim>,
}
/// A single falsifiable performance claim: verified when the measured
/// value reaches `threshold` (see `evaluate`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FalsifiableClaim {
    /// Short identifier, e.g. "IMP-800c-1".
    pub id: String,
    /// Human-readable description of the claim.
    pub description: String,
    /// Expected value stated by the claim.
    pub expected: f64,
    /// Minimum measured value for the claim to count as verified.
    pub threshold: f64,
    /// Measured value (0.0 until `evaluate` is called).
    pub measured: f64,
    /// Whether `measured >= threshold`.
    pub verified: bool,
}
impl FalsifiableClaim {
    /// Create an unevaluated claim (`measured = 0.0`, `verified = false`).
    #[must_use]
    pub fn new(
        id: impl Into<String>,
        description: impl Into<String>,
        expected: f64,
        threshold: f64,
    ) -> Self {
        Self {
            id: id.into(),
            description: description.into(),
            expected,
            threshold,
            measured: 0.0,
            verified: false,
        }
    }

    /// Record a measurement; the claim verifies when the measured value
    /// reaches the threshold.
    #[must_use]
    pub fn evaluate(self, measured: f64) -> Self {
        let verified = measured >= self.threshold;
        Self {
            measured,
            verified,
            ..self
        }
    }
}
impl GapAnalysis {
    /// Start an analysis with no statistics or claims attached.
    #[must_use]
    pub fn new(claimed_gap: f64, measured_gap: f64) -> Self {
        Self {
            claimed_gap,
            measured_gap,
            p_value: 0.0,
            ci_95_lower: 0.0,
            ci_95_upper: 0.0,
            popper_score: 0.0,
            claims: Vec::new(),
        }
    }

    /// Attach significance statistics (p-value and 95% CI bounds).
    #[must_use]
    pub fn with_statistics(mut self, p_value: f64, ci_lower: f64, ci_upper: f64) -> Self {
        self.p_value = p_value;
        self.ci_95_lower = ci_lower;
        self.ci_95_upper = ci_upper;
        self
    }

    /// Recompute the Popper score: the percentage of attached claims that
    /// verified, or 0.0 when there are no claims.
    pub fn calculate_popper_score(&mut self) {
        let total = self.claims.len();
        self.popper_score = if total == 0 {
            0.0
        } else {
            let verified = self.claims.iter().filter(|c| c.verified).count();
            (verified as f64 / total as f64) * 100.0
        };
    }

    /// Append a claim. Does not recompute the score; call
    /// `calculate_popper_score` afterwards if needed.
    pub fn add_claim(&mut self, claim: FalsifiableClaim) {
        self.claims.push(claim);
    }

    /// Whether the measured gap lies inside the 95% confidence interval.
    #[must_use]
    pub fn claim_verified(&self) -> bool {
        (self.ci_95_lower..=self.ci_95_upper).contains(&self.measured_gap)
    }

    /// Attach the four standard IMP-800c claims, evaluate each against the
    /// measured GPU throughput, and recompute the Popper score.
    #[must_use]
    pub fn with_default_claims(mut self, realizar_gpu_tps: f64) -> Self {
        // (id, description, expected, verification threshold in tok/s)
        let standard_claims = [
            ("IMP-800c-1", "GPU faster than CPU SIMD (>5x)", 5.0, 25.0),
            ("IMP-800c-2", "GPU within 10x of Ollama", 10.0, 24.0),
            ("IMP-800c-3", "GPU within 2x of Ollama (M2)", 2.0, 120.0),
            ("IMP-800c-4", "GPU at parity with Ollama (M4)", 1.25, 192.0),
        ];
        for (id, description, expected, threshold) in standard_claims {
            self.claims.push(
                FalsifiableClaim::new(id, description, expected, threshold)
                    .evaluate(realizar_gpu_tps),
            );
        }
        self.calculate_popper_score();
        self
    }
}
/// Tuning knobs for the optimized GEMM kernel.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizedGemmConfig {
    /// Square tile edge length (elements) staged in shared memory.
    pub tile_size: u32,
    /// Register-block edge length: each thread computes a reg_block x reg_block sub-tile.
    pub reg_block: u32,
    /// Whether to use tensor cores (disabled in all presets here).
    pub use_tensor_cores: bool,
    /// Width of vectorized loads/stores in elements.
    pub vector_width: u32,
    /// Unroll factor along the K dimension.
    pub k_unroll: u32,
    /// Whether to double-buffer shared-memory tiles (doubles the footprint).
    pub double_buffer: bool,
}
impl Default for OptimizedGemmConfig {
    /// Balanced preset: 32x32 tiles, 4x4 register blocks, 4-wide vectors,
    /// K unrolled by 4, double buffering on, tensor cores off.
    fn default() -> Self {
        Self {
            tile_size: 32,
            reg_block: 4,
            vector_width: 4,
            k_unroll: 4,
            double_buffer: true,
            use_tensor_cores: false,
        }
    }
}
impl OptimizedGemmConfig {
    /// Preset for small matrices: 16x16 tiles, 2x2 register blocks, no
    /// double buffering.
    #[must_use]
    pub fn small() -> Self {
        Self {
            tile_size: 16,
            reg_block: 2,
            vector_width: 4,
            k_unroll: 4,
            double_buffer: false,
            use_tensor_cores: false,
        }
    }

    /// Preset for large matrices: 64x64 tiles, 8x8 register blocks, K
    /// unrolled by 8, double buffering on.
    #[must_use]
    pub fn large() -> Self {
        Self {
            tile_size: 64,
            reg_block: 8,
            vector_width: 4,
            k_unroll: 8,
            double_buffer: true,
            use_tensor_cores: false,
        }
    }

    /// Shared-memory footprint in bytes: two f32 tiles (one each for A and
    /// B), doubled again when double buffering is enabled.
    #[must_use]
    pub fn shared_memory_bytes(&self) -> u32 {
        let tile_bytes = self.tile_size * self.tile_size * 4;
        let tile_count = if self.double_buffer { 4 } else { 2 };
        tile_bytes * tile_count
    }

    /// Threads per block: one thread per reg_block x reg_block sub-tile of
    /// the shared-memory tile.
    #[must_use]
    pub fn threads_per_block(&self) -> u32 {
        (self.tile_size / self.reg_block).pow(2)
    }

    /// Accumulator registers each thread holds (reg_block squared).
    #[must_use]
    pub fn registers_per_thread(&self) -> u32 {
        self.reg_block.pow(2)
    }
}
/// Measured performance of a single `m x n x k` GEMM run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GemmPerformanceResult {
    /// Rows of the output matrix.
    pub m: u32,
    /// Columns of the output matrix.
    pub n: u32,
    /// Inner (reduction) dimension.
    pub k: u32,
    /// Wall-clock time of the run in milliseconds.
    pub time_ms: f64,
    /// Achieved compute rate in GFLOP/s.
    pub gflops: f64,
    /// Achieved memory bandwidth in GB/s.
    pub bandwidth_gbs: f64,
    /// Percentage of peak GFLOP/s (0.0 until set via `with_peak`).
    pub efficiency: f64,
}
impl GemmPerformanceResult {
    /// Derive GFLOP/s and bandwidth from a timed `m x n x k` GEMM run.
    ///
    /// FLOP count is `2*m*n*k` (a multiply and an add per MAC); bytes
    /// moved assume one f32 read each of A (`m*k`) and B (`k*n`) plus one
    /// f32 write of C (`m*n`).
    #[must_use]
    pub fn new(m: u32, n: u32, k: u32, time_ms: f64) -> Self {
        let (mf, nf, kf) = (f64::from(m), f64::from(n), f64::from(k));
        let ops = 2.0 * mf * nf * kf;
        let bytes = (mf * kf + kf * nf + mf * nf) * 4.0;
        Self {
            m,
            n,
            k,
            time_ms,
            // ms * 1e6 converts FLOP/ms to GFLOP/s (and bytes/ms to GB/s).
            gflops: ops / (time_ms * 1e6),
            bandwidth_gbs: bytes / (time_ms * 1e6),
            // Unknown until `with_peak` supplies the device peak.
            efficiency: 0.0,
        }
    }

    /// Fill in efficiency as a percentage of `peak_gflops`.
    #[must_use]
    pub fn with_peak(mut self, peak_gflops: f64) -> Self {
        self.efficiency = (self.gflops / peak_gflops) * 100.0;
        self
    }

    /// True when this result is at least `factor` times `baseline_gflops`.
    #[must_use]
    pub fn improved_by(&self, baseline_gflops: f64, factor: f64) -> bool {
        self.gflops >= baseline_gflops * factor
    }
}
/// Benchmark harness parameters for the optimized GEMM kernel.
#[derive(Debug)]
pub struct OptimizedGemmBenchmark {
    /// Kernel tuning configuration under test.
    pub config: OptimizedGemmConfig,
    /// Untimed warm-up iterations before measurement.
    pub warmup_iterations: usize,
    /// Timed measurement iterations.
    pub measurement_iterations: usize,
    /// Target coefficient of variation for stability.
    pub target_cv: f64,
}
impl Default for OptimizedGemmBenchmark {
    /// Default harness: 5 warm-up plus 20 measured iterations with a 5%
    /// CV target, using the default kernel configuration.
    fn default() -> Self {
        Self {
            warmup_iterations: 5,
            measurement_iterations: 20,
            target_cv: 0.05,
            config: OptimizedGemmConfig::default(),
        }
    }
}
impl OptimizedGemmBenchmark {
    /// Build a benchmark around `config`, keeping the default iteration
    /// counts and CV target.
    #[must_use]
    pub fn with_config(config: OptimizedGemmConfig) -> Self {
        Self {
            config,
            ..Self::default()
        }
    }

    /// Rough multiplicative speedup model over a naive kernel: tiling
    /// always contributes 2x; register blocking (>=4), wide vectors (>=4),
    /// and double buffering each stack a further factor when enabled.
    #[must_use]
    pub fn expected_improvement(&self) -> f64 {
        // Shared-memory tiling is assumed always active: base factor 2x.
        let mut factor = 2.0;
        if self.config.reg_block >= 4 {
            factor *= 1.5;
        }
        if self.config.vector_width >= 4 {
            factor *= 1.3;
        }
        if self.config.double_buffer {
            factor *= 1.2;
        }
        factor
    }
}
/// Kinds of fused GPU operations tracked by `FusedOpSpec`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FusedOpType {
    /// GEMM + bias add + activation in one kernel.
    GemmBiasActivation,
    /// Layer norm fused with the following linear layer.
    LayerNormLinear,
    /// Fused attention kernel.
    FusedAttention,
    /// Fused feed-forward network.
    FusedFfn,
}
/// Specification of a fused operation and the launch counts it replaces.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FusedOpSpec {
    /// Which fused operation this describes.
    pub op_type: FusedOpType,
    /// Input tensor dimensions.
    pub input_dims: Vec<u32>,
    /// Output tensor dimensions.
    pub output_dims: Vec<u32>,
    /// Optional activation function name.
    pub activation: Option<String>,
    /// Kernel launches needed by the fused implementation.
    pub fused_launches: u32,
    /// Kernel launches needed by the unfused equivalent.
    pub unfused_launches: u32,
}
impl FusedOpSpec {
    /// Ratio of unfused to fused kernel launches, e.g. 3.0 means fusion
    /// replaces three launches with one.
    #[must_use]
    pub fn launch_reduction(&self) -> f64 {
        let unfused = f64::from(self.unfused_launches);
        let fused = f64::from(self.fused_launches);
        unfused / fused
    }

    /// Whether fusion at least halves the launch count (>= 2x reduction).
    #[must_use]
    pub fn achieves_target_reduction(&self) -> bool {
        self.launch_reduction() >= 2.0
    }
}
/// Tiling and shape parameters for a flash-attention kernel.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlashAttentionConfig {
    /// Query-tile size in tokens.
    pub block_size_q: u32,
    /// Key/value-tile size in tokens.
    pub block_size_kv: u32,
    /// Per-head embedding dimension.
    pub head_dim: u32,
    /// Number of attention heads.
    pub num_heads: u32,
    /// Whether a causal mask is applied.
    pub causal: bool,
    /// Score scaling factor (typically 1/sqrt(head_dim)).
    pub scale: f32,
}
impl FlashAttentionConfig {
    /// Configuration matching phi-2: 32 causal heads of dimension 80,
    /// 64x64 tiling, and 1/sqrt(head_dim) score scaling.
    #[must_use]
    pub fn phi2() -> Self {
        Self {
            block_size_q: 64,
            block_size_kv: 64,
            head_dim: 80,
            num_heads: 32,
            causal: true,
            // recip() computes 1.0 / x, identical to the explicit division.
            scale: (80.0_f32).sqrt().recip(),
        }
    }

    /// Attention workspace in bytes as `(naive, flash)`: naive
    /// materialises the full `seq_len x seq_len` f32 score matrix, flash
    /// keeps only two `block_q x block_kv` f32 tiles.
    #[must_use]
    pub fn memory_comparison(&self, seq_len: u32) -> (u64, u64) {
        let seq = u64::from(seq_len);
        let naive_bytes = seq * seq * 4;
        let flash_bytes = u64::from(self.block_size_q) * u64::from(self.block_size_kv) * 4 * 2;
        (naive_bytes, flash_bytes)
    }

    /// Factor by which flash attention shrinks the workspace versus naive.
    #[must_use]
    pub fn memory_savings(&self, seq_len: u32) -> f64 {
        let (naive, flash) = self.memory_comparison(seq_len);
        naive as f64 / flash as f64
    }
}
/// GPU memory-pool sizing and transfer options.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryPoolConfig {
    /// Initial pool size in bytes.
    pub initial_size: usize,
    /// Maximum pool size in bytes.
    pub max_size: usize,
    /// Allocation size classes in bytes; `find_size_class` assumes ascending order.
    pub size_classes: Vec<usize>,
    /// Whether host buffers use pinned (page-locked) memory.
    pub use_pinned_memory: bool,
    /// Whether host<->device transfers are asynchronous.
    pub async_transfers: bool,
}
impl Default for MemoryPoolConfig {
    /// 256 MiB initial / 2 GiB max pool, size classes stepping 4x from
    /// 4 KiB to 256 MiB, pinned memory and async transfers enabled.
    fn default() -> Self {
        let size_classes = vec![
            4096,
            16384,
            65536,
            262_144,
            1_048_576,
            4_194_304,
            16_777_216,
            67_108_864,
            268_435_456,
        ];
        Self {
            initial_size: 256 * 1024 * 1024,
            max_size: 2 * 1024 * 1024 * 1024,
            size_classes,
            use_pinned_memory: true,
            async_transfers: true,
        }
    }
}
impl MemoryPoolConfig {
    /// First configured size class that can hold `requested` bytes, or
    /// `None` when the request exceeds every class. Relies on
    /// `size_classes` being sorted ascending (as in `Default`).
    #[must_use]
    pub fn find_size_class(&self, requested: usize) -> Option<usize> {
        self.size_classes
            .iter()
            .find(|&&size| size >= requested)
            .copied()
    }

    /// Expected host<->device bandwidth multiplier: 2.4x with pinned
    /// memory, otherwise no improvement.
    #[must_use]
    pub fn expected_bandwidth_improvement(&self) -> f64 {
        if self.use_pinned_memory {
            2.4
        } else {
            1.0
        }
    }
}
/// Projected throughput after stacking the IMP-900 optimisation factors
/// on a measured baseline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Imp900Result {
    /// Measured baseline throughput in tokens/s.
    pub baseline_tps: f64,
    /// Baseline multiplied by all improvement factors.
    pub optimized_tps: f64,
    /// GEMM optimisation factor (1.0 = none).
    pub gemm_improvement: f64,
    /// Kernel-fusion factor (1.0 = none).
    pub fusion_improvement: f64,
    /// Flash-attention factor (1.0 = none).
    pub flash_attention_improvement: f64,
    /// Memory-pool factor (1.0 = none).
    pub memory_improvement: f64,
    /// 240 tok/s reference divided by `optimized_tps`.
    pub gap_ratio: f64,
    /// Milestone label ("M2"/"M3"/"M4") reached by the gap ratio, if any.
    pub milestone: Option<String>,
}
impl Imp900Result {
    /// Reference throughput (tok/s) the gap ratio is measured against.
    const REFERENCE_TPS: f64 = 240.0;

    /// Start from a measured baseline with every improvement factor at 1.0.
    #[must_use]
    pub fn from_baseline(baseline_tps: f64) -> Self {
        Self {
            baseline_tps,
            optimized_tps: baseline_tps,
            gemm_improvement: 1.0,
            fusion_improvement: 1.0,
            flash_attention_improvement: 1.0,
            memory_improvement: 1.0,
            gap_ratio: Self::REFERENCE_TPS / baseline_tps,
            milestone: None,
        }
    }

    /// Set the GEMM factor and refresh the derived fields.
    #[must_use]
    pub fn with_gemm_improvement(mut self, factor: f64) -> Self {
        self.gemm_improvement = factor;
        self.recalculate();
        self
    }

    /// Set the fusion factor and refresh the derived fields.
    #[must_use]
    pub fn with_fusion_improvement(mut self, factor: f64) -> Self {
        self.fusion_improvement = factor;
        self.recalculate();
        self
    }

    /// Set the flash-attention factor and refresh the derived fields.
    #[must_use]
    pub fn with_flash_attention_improvement(mut self, factor: f64) -> Self {
        self.flash_attention_improvement = factor;
        self.recalculate();
        self
    }

    /// Set the memory factor and refresh the derived fields.
    #[must_use]
    pub fn with_memory_improvement(mut self, factor: f64) -> Self {
        self.memory_improvement = factor;
        self.recalculate();
        self
    }

    /// Refresh `optimized_tps`, `gap_ratio`, and `milestone` from the
    /// current improvement factors (factors combine multiplicatively).
    fn recalculate(&mut self) {
        let combined = self.gemm_improvement
            * self.fusion_improvement
            * self.flash_attention_improvement
            * self.memory_improvement;
        self.optimized_tps = self.baseline_tps * combined;
        self.gap_ratio = Self::REFERENCE_TPS / self.optimized_tps;
        // NOTE(review): these labels disagree with achieves_m3/achieves_m4
        // below (gap <= 5.0 is labelled "M2" here but gates achieves_m3).
        // Preserved as-is; confirm the intended milestone mapping.
        self.milestone = if self.gap_ratio <= 1.25 {
            Some("M4".to_string())
        } else if self.gap_ratio <= 2.0 {
            Some("M3".to_string())
        } else if self.gap_ratio <= 5.0 {
            Some("M2".to_string())
        } else {
            None
        };
    }

    /// At least 48 tok/s and within 5x of the reference.
    #[must_use]
    pub fn achieves_m3(&self) -> bool {
        self.optimized_tps >= 48.0 && self.gap_ratio <= 5.0
    }

    /// At least 192 tok/s and within 1.25x of the reference.
    #[must_use]
    pub fn achieves_m4(&self) -> bool {
        self.optimized_tps >= 192.0 && self.gap_ratio <= 1.25
    }

    /// Overall speedup factor over the baseline.
    #[must_use]
    pub fn total_improvement(&self) -> f64 {
        self.optimized_tps / self.baseline_tps
    }
}
#[cfg(all(test, feature = "heavy-tests"))]
mod tests {
use super::*;
#[test]
fn test_dynamic_sampler_continues_until_min_samples() {
let mut dyn_sampler = DynamicSampler::new(100, 10_000, 0.05);
let data: Vec<f64> = (0..50).map(|i| i as f64).collect();
assert!(dyn_sampler.should_continue(&data));
}
#[test]
fn test_dynamic_sampler_stops_at_max_samples() {
let mut dyn_sampler = DynamicSampler::new(10, 100, 0.05);
let data: Vec<f64> = (0..100).map(|i| (i % 50) as f64 * 10.0).collect();
assert!(!dyn_sampler.should_continue(&data));
}
#[test]
fn test_dynamic_sampler_stops_when_cv_stable() {
let mut dyn_sampler = DynamicSampler::new(10, 10_000, 0.05);
dyn_sampler.stability_count = 1;
let data: Vec<f64> = vec![100.0; 100];
assert!(!dyn_sampler.should_continue(&data));
}
#[test]
fn test_dynamic_sampler_requires_stability_streak() {
let mut dyn_sampler = DynamicSampler::new(10, 10_000, 0.05);
dyn_sampler.stability_count = 3;
let data: Vec<f64> = vec![100.0; 100];
assert!(dyn_sampler.should_continue(&data));
assert!(dyn_sampler.should_continue(&data));
assert!(!dyn_sampler.should_continue(&data));
}
#[test]
fn test_dynamic_sampler_reset() {
let mut sampler = DynamicSampler::new(10, 10_000, 0.05);
sampler.stable_streak = 5;
sampler.reset();
assert_eq!(sampler.stable_streak, 0);
}
#[test]
fn test_compute_cv_constant_values() {
let data = vec![100.0; 50];
let cv = compute_cv(&data);
assert!(cv.abs() < 1e-10, "CV of constant values should be ~0");
}
#[test]
fn test_compute_cv_varied_values() {
let data = vec![10.0, 20.0, 30.0, 40.0, 50.0];
let cv = compute_cv(&data);
assert!(cv > 0.5 && cv < 0.6, "CV should be ~0.527, got {cv}");
}
#[test]
fn test_compute_cv_empty_data() {
let data: Vec<f64> = vec![];
let cv = compute_cv(&data);
assert!(cv.is_infinite());
}
#[test]
fn test_thermal_guard_valid_low_variance() {
let guard = ThermalGuard::default();
let temps = vec![75.0, 75.5, 74.8, 75.2, 75.1];
assert_eq!(guard.validate_run(&temps), ThermalValidity::Valid);
}
#[test]
fn test_thermal_guard_invalid_high_variance() {
let guard = ThermalGuard::default();
let temps = vec![70.0, 75.0, 80.0, 72.0, 78.0];
match guard.validate_run(&temps) {
ThermalValidity::Invalid(msg) => {
assert!(msg.contains("exceeds threshold"));
},
ThermalValidity::Valid => panic!("Expected Invalid"),
}
}
#[test]
fn test_thermal_guard_empty_temps() {
let guard = ThermalGuard::default();
assert_eq!(guard.validate_run(&[]), ThermalValidity::Valid);
}
#[test]
fn test_thermal_guard_max_temp() {
let guard = ThermalGuard::default();
let temps = vec![70.0, 75.0, 85.0, 72.0];
assert_eq!(guard.max_temp(&temps), 85.0);
}
#[test]
fn test_kv_cache_metrics_no_waste() {
let metrics = KvCacheMetrics::new(1000, 1000);
assert_eq!(metrics.fragmentation_pct, 0.0);
assert!(metrics.is_acceptable(10.0));
}
#[test]
fn test_kv_cache_metrics_with_waste() {
let metrics = KvCacheMetrics::new(1000, 800);
assert!((metrics.fragmentation_pct - 20.0).abs() < 0.01);
assert!(!metrics.is_acceptable(10.0));
assert!(metrics.is_acceptable(25.0));
}
#[test]
fn test_kv_cache_metrics_zero_allocated() {
let metrics = KvCacheMetrics::new(0, 0);
assert_eq!(metrics.fragmentation_pct, 0.0);
}
#[test]
fn test_kv_cache_metrics_mb_conversion() {
let metrics = KvCacheMetrics::new(1024 * 1024 * 100, 1024 * 1024 * 80);
assert!((metrics.allocated_mb() - 100.0).abs() < 0.01);
assert!((metrics.used_mb() - 80.0).abs() < 0.01);
}
#[test]
fn test_energy_metrics_joules_per_token() {
let metrics = EnergyMetrics::new(100.0, 10.0, 50.0, 1000);
assert!((metrics.joules_per_token() - 0.1).abs() < 0.001);
}
#[test]
fn test_energy_metrics_zero_tokens() {
let metrics = EnergyMetrics::new(100.0, 10.0, 50.0, 0);
assert_eq!(metrics.joules_per_token(), 0.0);
}
#[test]
fn test_energy_metrics_tokens_per_joule() {
let metrics = EnergyMetrics::new(100.0, 10.0, 50.0, 1000);
assert!((metrics.tokens_per_joule() - 10.0).abs() < 0.001);
}
#[test]
fn test_itl_metrics_from_measurements() {
let itl = vec![10.0, 12.0, 11.0, 15.0, 13.0, 14.0, 11.0, 12.0, 13.0, 10.0];
let metrics = ItlMetrics::from_measurements(&itl);
assert!(metrics.median_ms > 11.0 && metrics.median_ms < 13.0);
assert!(metrics.std_dev_ms < 5.0);
assert!(metrics.p99_ms >= 14.0);
}
#[test]
fn test_itl_metrics_empty() {
let metrics = ItlMetrics::from_measurements(&[]);
assert_eq!(metrics.median_ms, 0.0);
assert_eq!(metrics.std_dev_ms, 0.0);
}
#[test]
fn test_itl_metrics_low_jitter() {
let itl = vec![10.0; 100];
let metrics = ItlMetrics::from_measurements(&itl);
assert!(metrics.is_low_jitter(1.0));
}
#[test]
fn test_itl_metrics_high_jitter() {
let itl: Vec<f64> = (0..100).map(|i| i as f64).collect();
let metrics = ItlMetrics::from_measurements(&itl);
assert!(!metrics.is_low_jitter(5.0));
}
#[test]
fn test_kl_divergence_identical_distributions() {
let logits = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let result = validate_quantization_quality(&logits, &logits, 0.01);
match result {
QualityResult::Pass { kl_divergence } => {
assert!(kl_divergence < 1e-10, "KL should be ~0 for identical");
},
QualityResult::Fail { .. } => panic!("Expected Pass for identical"),
}
}
#[test]
fn test_kl_divergence_slightly_different() {
let fp32 = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let quant = vec![1.01, 2.01, 3.01, 4.01, 5.01];
let result = validate_quantization_quality(&fp32, &quant, 0.01);
match result {
QualityResult::Pass { kl_divergence } => {
assert!(kl_divergence < 0.001, "KL should be very small");
},
QualityResult::Fail { .. } => panic!("Expected Pass for small diff"),
}
}
#[test]
fn test_kl_divergence_very_different() {
let fp32 = vec![10.0, 0.0, 0.0, 0.0, 0.0];
let quant = vec![0.0, 0.0, 0.0, 0.0, 10.0];
let result = validate_quantization_quality(&fp32, &quant, 0.01);
match result {
QualityResult::Fail { kl_divergence, .. } => {
assert!(kl_divergence > 1.0, "KL should be large for opposite");
},
QualityResult::Pass { .. } => panic!("Expected Fail for very different"),
}
}
#[test]
fn test_kl_divergence_mismatched_lengths() {
let fp32 = vec![1.0, 2.0, 3.0];
let quant = vec![1.0, 2.0];
let result = validate_quantization_quality(&fp32, &quant, 0.01);
assert!(matches!(result, QualityResult::Fail { .. }));
}
#[test]
fn test_kl_divergence_empty() {
let result = validate_quantization_quality(&[], &[], 0.01);
assert!(matches!(result, QualityResult::Pass { .. }));
}
#[test]
fn test_benchmark_result_summary() {
let result = BenchmarkResult {
config: BenchmarkConfig {
model: "test".to_string(),
format: "apr".to_string(),
quantization: "q4_k".to_string(),
runtime: "realizar".to_string(),
runtime_version: "0.2.3".to_string(),
},
cold_start_ms: 100.0,
model_load_ms: 50.0,
ttft_ms: vec![20.0, 22.0, 21.0, 25.0, 23.0, 24.0, 22.0, 21.0, 20.0, 26.0],
itl_ms: vec![10.0, 11.0, 10.5, 11.5, 10.2, 10.8, 11.2, 10.3, 10.7, 11.0],
generation_tok_s: vec![140.0, 142.0, 141.0, 143.0, 139.0],
peak_memory_mb: 1024,
kv_cache_waste_pct: 3.5,
energy_joules: 50.0,
tokens_generated: 1000,
actual_iterations: 500,
cv_at_stop: 0.045,
timestamp: 12345,
};
let summary = result.summary();
assert!(summary.ttft_p50 > 20.0 && summary.ttft_p50 < 25.0);
assert!(summary.ttft_p99 >= summary.ttft_p50);
assert!(summary.ttft_p999 >= summary.ttft_p99);
assert!(summary.itl_median > 10.0 && summary.itl_median < 12.0);
assert!(summary.itl_std_dev < 2.0);
assert!(summary.throughput_median > 139.0 && summary.throughput_median < 144.0);
assert!((summary.token_joules - 0.05).abs() < 0.001);
assert_eq!(summary.iterations, 500);
assert!((summary.cv_final - 0.045).abs() < 0.001);
}
#[test]
fn test_percentile_calculation() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
assert!(percentile(&data, 50.0) >= 5.0 && percentile(&data, 50.0) <= 6.0);
assert!(percentile(&data, 90.0) >= 9.0);
assert_eq!(percentile(&data, 100.0), 10.0);
}
#[test]
fn test_bootstrap_ci() {
let data = vec![100.0; 100];
let (lower, upper) = bootstrap_ci(&data, 0.95, 1000);
assert!((lower - 100.0).abs() < 0.01);
assert!((upper - 100.0).abs() < 0.01);
}
#[test]
fn test_softmax_sums_to_one() {
let logits = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let probs = softmax(&logits);
let sum: f64 = probs.iter().sum();
assert!((sum - 1.0).abs() < 1e-10);
}
#[test]
fn test_softmax_monotonic() {
let logits = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let probs = softmax(&logits);
for i in 1..probs.len() {
assert!(probs[i] > probs[i - 1]);
}
}
#[test]
fn test_softmax_numerical_stability() {
let logits = vec![1000.0, 1001.0, 1002.0];
let probs = softmax(&logits);
let sum: f64 = probs.iter().sum();
assert!((sum - 1.0).abs() < 1e-10);
}
#[test]
fn test_workload_type_short_qa() {
let workload = WorkloadType::ShortQa;
assert_eq!(workload.input_tokens(), 32);
assert_eq!(workload.output_tokens(), 64);
}
#[test]
fn test_workload_type_long_context() {
let workload = WorkloadType::LongContext;
assert_eq!(workload.input_tokens(), 2048);
assert_eq!(workload.output_tokens(), 512);
}
#[test]
fn test_convoy_config_default() {
let config = ConvoyTestConfig::default();
assert_eq!(config.long_requests, 10);
assert_eq!(config.short_requests, 100);
assert!((config.max_p99_increase_pct - 50.0).abs() < 0.01);
assert!((config.max_hol_blocking_ms - 500.0).abs() < 0.01);
assert!((config.max_kv_fragmentation_pct - 15.0).abs() < 0.01);
}
#[test]
fn test_convoy_test_result_pass() {
let config = ConvoyTestConfig::default();
let baseline = vec![10.0, 12.0, 11.0, 13.0, 10.5]; let convoy = vec![12.0, 14.0, 13.0, 15.0, 12.5]; let hol = vec![50.0, 100.0, 75.0, 80.0, 60.0];
let kv_frag = 10.0;
let result = ConvoyTestResult::new(&config, &baseline, &convoy, &hol, kv_frag);
assert!(result.passed, "Should pass with acceptable metrics");
assert!(result.failure_reasons.is_empty());
assert!(result.p99_increase_pct < 50.0);
assert!(result.max_hol_blocking_ms < 500.0);
}
#[test]
fn test_convoy_test_result_fail_p99() {
let config = ConvoyTestConfig::default();
let baseline = vec![10.0; 100];
let convoy = vec![20.0; 100]; let hol = vec![50.0; 100];
let kv_frag = 5.0;
let result = ConvoyTestResult::new(&config, &baseline, &convoy, &hol, kv_frag);
assert!(!result.passed, "Should fail with 100% p99 increase");
assert!(result.failure_reasons.iter().any(|r| r.contains("P99")));
}
#[test]
fn test_convoy_test_result_fail_hol_blocking() {
let config = ConvoyTestConfig::default();
let baseline = vec![10.0; 100];
let convoy = vec![11.0; 100]; let hol = vec![600.0; 100]; let kv_frag = 5.0;
let result = ConvoyTestResult::new(&config, &baseline, &convoy, &hol, kv_frag);
assert!(!result.passed, "Should fail with HOL blocking > 500ms");
assert!(result.failure_reasons.iter().any(|r| r.contains("HOL")));
}
#[test]
fn test_convoy_test_result_fail_kv_fragmentation() {
let config = ConvoyTestConfig::default();
let baseline = vec![10.0; 100];
let convoy = vec![11.0; 100];
let hol = vec![50.0; 100];
let kv_frag = 20.0;
let result = ConvoyTestResult::new(&config, &baseline, &convoy, &hol, kv_frag);
assert!(!result.passed, "Should fail with KV fragmentation > 15%");
assert!(result.failure_reasons.iter().any(|r| r.contains("KV")));
}
#[test]
fn test_saturation_config_default() {
let config = SaturationTestConfig::default();
assert_eq!(config.cpu_load_pct, 50);
assert!((config.max_throughput_degradation_pct - 30.0).abs() < 0.01);
assert!((config.max_p99_increase_pct - 100.0).abs() < 0.01);
}
#[test]
fn test_saturation_test_result_pass() {
let config = SaturationTestConfig::default();
let baseline_throughput = vec![100.0, 102.0, 98.0, 101.0, 99.0];
let stressed_throughput = vec![85.0, 87.0, 83.0, 86.0, 84.0]; let baseline_latency = vec![10.0, 12.0, 11.0, 10.5, 11.5];
let stressed_latency = vec![15.0, 17.0, 16.0, 15.5, 16.5];
let result = SaturationTestResult::new(
&config,
&baseline_throughput,
&stressed_throughput,
&baseline_latency,
&stressed_latency,
);
assert!(result.passed, "Should pass with acceptable degradation");
assert!(result.throughput_degradation_pct < 30.0);
assert!(result.p99_increase_pct < 100.0);
}
#[test]
fn test_saturation_test_result_fail_throughput() {
let config = SaturationTestConfig::default();
let baseline_throughput = vec![100.0; 100];
let stressed_throughput = vec![50.0; 100]; let baseline_latency = vec![10.0; 100];
let stressed_latency = vec![15.0; 100];
let result = SaturationTestResult::new(
&config,
&baseline_throughput,
&stressed_throughput,
&baseline_latency,
&stressed_latency,
);
assert!(
!result.passed,
"Should fail with 50% throughput degradation"
);
assert!(result
.failure_reasons
.iter()
.any(|r| r.contains("Throughput")));
}
#[test]
fn test_saturation_test_result_fail_p99() {
let config = SaturationTestConfig::default();
let baseline_throughput = vec![100.0; 100];
let stressed_throughput = vec![90.0; 100]; let baseline_latency = vec![10.0; 100];
let stressed_latency = vec![25.0; 100];
let result = SaturationTestResult::new(
&config,
&baseline_throughput,
&stressed_throughput,
&baseline_latency,
&stressed_latency,
);
assert!(!result.passed, "Should fail with 150% p99 increase");
assert!(result.failure_reasons.iter().any(|r| r.contains("P99")));
}
#[test]
fn test_hardware_spec_default() {
let spec = HardwareSpec::default();
assert_eq!(spec.cpu, "Unknown");
assert!(spec.gpu.is_none());
assert_eq!(spec.memory_gb, 0);
assert_eq!(spec.storage, "Unknown");
}
#[test]
fn test_sampling_config_default() {
let config = SamplingConfig::default();
assert_eq!(config.method, "dynamic_cv");
assert!((config.cv_threshold - 0.05).abs() < 0.001);
assert_eq!(config.warmup_iterations, 100);
}
#[test]
fn test_thermal_info_default() {
let info = ThermalInfo::default();
assert!(info.valid);
assert!((info.temp_variance_c - 0.0).abs() < 0.001);
assert!((info.max_temp_c - 0.0).abs() < 0.001);
}
#[test]
fn test_full_benchmark_result_from_benchmark_result() {
let result = BenchmarkResult {
config: BenchmarkConfig {
model: "test".to_string(),
format: "apr".to_string(),
quantization: "q4_k".to_string(),
runtime: "realizar".to_string(),
runtime_version: "0.2.3".to_string(),
},
cold_start_ms: 100.0,
model_load_ms: 50.0,
ttft_ms: vec![20.0, 22.0, 21.0, 25.0, 23.0],
itl_ms: vec![10.0, 11.0, 10.5, 11.5, 10.2],
generation_tok_s: vec![140.0, 142.0, 141.0],
peak_memory_mb: 1024,
kv_cache_waste_pct: 3.5,
energy_joules: 50.0,
tokens_generated: 1000,
actual_iterations: 500,
cv_at_stop: 0.045,
timestamp: 12345,
};
let hardware = HardwareSpec {
cpu: "Apple M3 Max".to_string(),
gpu: Some("Apple M3 Max (40 cores)".to_string()),
memory_gb: 128,
storage: "NVMe".to_string(),
};
let temps = vec![72.0, 73.0, 72.5, 73.5, 72.0];
let kl_div = 0.031;
let full_result =
FullBenchmarkResult::from_benchmark_result(&result, hardware, &temps, kl_div);
assert_eq!(full_result.version, "1.1");
assert!(full_result.timestamp.contains("1970")); assert_eq!(full_result.config.model, "test");
assert_eq!(full_result.hardware.cpu, "Apple M3 Max");
assert_eq!(full_result.sampling.actual_iterations, 500);
assert!(full_result.thermal.valid);
assert!((full_result.quality.kl_divergence_vs_fp32 - 0.031).abs() < 0.001);
}
#[test]
fn test_full_benchmark_result_json_roundtrip() {
let result = BenchmarkResult {
config: BenchmarkConfig {
model: "test".to_string(),
format: "apr".to_string(),
quantization: "q4_k".to_string(),
runtime: "realizar".to_string(),
runtime_version: "0.2.3".to_string(),
},
cold_start_ms: 100.0,
model_load_ms: 50.0,
ttft_ms: vec![20.0, 22.0, 21.0],
itl_ms: vec![10.0, 11.0, 10.5],
generation_tok_s: vec![140.0, 142.0],
peak_memory_mb: 1024,
kv_cache_waste_pct: 3.5,
energy_joules: 50.0,
tokens_generated: 1000,
actual_iterations: 500,
cv_at_stop: 0.045,
timestamp: 12345,
};
let full_result =
FullBenchmarkResult::from_benchmark_result(&result, HardwareSpec::default(), &[], 0.0);
let json = full_result.to_json().expect("Should serialize");
let parsed: FullBenchmarkResult =
FullBenchmarkResult::from_json(&json).expect("Should parse");
assert_eq!(parsed.version, "1.1");
assert_eq!(parsed.config.model, "test");
assert_eq!(parsed.sampling.actual_iterations, 500);
}
#[test]
fn test_benchmark_comparison_realizar_wins() {
let baseline = create_test_full_result("llama.cpp", 40.0, 100.0, 1500, 0.06);
let current = create_test_full_result("realizar", 30.0, 140.0, 1200, 0.04);
let comparison = BenchmarkComparison::compare(&baseline, ¤t);
assert_eq!(comparison.winner, "realizar");
assert!(comparison.ttft_p99_change_pct < 0.0); assert!(comparison.throughput_change_pct > 0.0); assert!(comparison.memory_change_pct < 0.0); assert!(comparison.energy_change_pct < 0.0); }
#[test]
fn test_benchmark_comparison_tie() {
let baseline = create_test_full_result("runtime_a", 30.0, 140.0, 1200, 0.04);
let current = create_test_full_result("runtime_b", 30.0, 140.0, 1200, 0.04);
let comparison = BenchmarkComparison::compare(&baseline, ¤t);
assert_eq!(comparison.winner, "tie");
}
#[test]
fn test_regression_result_no_regression() {
let baseline = create_test_full_result("realizar", 30.0, 140.0, 1200, 0.04);
let current = create_test_full_result("realizar", 29.0, 145.0, 1150, 0.038);
let regression = RegressionResult::check(&baseline, ¤t, 5.0);
assert!(!regression.regression_detected);
assert!(regression.regressed_metrics.is_empty());
}
#[test]
fn test_regression_result_ttft_regression() {
let baseline = create_test_full_result("realizar", 30.0, 140.0, 1200, 0.04);
let current = create_test_full_result("realizar", 35.0, 140.0, 1200, 0.04);
let regression = RegressionResult::check(&baseline, ¤t, 5.0);
assert!(regression.regression_detected);
assert!(regression
.regressed_metrics
.iter()
.any(|m| m.contains("ttft")));
}
#[test]
fn test_regression_result_throughput_regression() {
let baseline = create_test_full_result("realizar", 30.0, 140.0, 1200, 0.04);
let current = create_test_full_result("realizar", 30.0, 120.0, 1200, 0.04);
let regression = RegressionResult::check(&baseline, ¤t, 5.0);
assert!(regression.regression_detected);
assert!(regression
.regressed_metrics
.iter()
.any(|m| m.contains("throughput")));
}
#[test]
fn test_regression_result_memory_regression() {
let baseline = create_test_full_result("realizar", 30.0, 140.0, 1200, 0.04);
let current = create_test_full_result("realizar", 30.0, 140.0, 1400, 0.04);
let regression = RegressionResult::check(&baseline, ¤t, 5.0);
assert!(regression.regression_detected);
assert!(regression
.regressed_metrics
.iter()
.any(|m| m.contains("memory")));
}
fn create_test_full_result(
runtime: &str,
ttft_p99: f64,
throughput: f64,
memory_mb: u64,
token_joules: f64,
) -> FullBenchmarkResult {
FullBenchmarkResult {
version: "1.1".to_string(),
timestamp: "2025-12-09T12:00:00Z".to_string(),
config: BenchmarkConfig {
model: "test".to_string(),
format: "apr".to_string(),
quantization: "q4_k".to_string(),
runtime: runtime.to_string(),
runtime_version: "1.0.0".to_string(),
},
hardware: HardwareSpec::default(),
sampling: SamplingConfig::default(),
thermal: ThermalInfo::default(),
results: BenchmarkResults {
ttft_ms: TtftResults {
p50: ttft_p99 * 0.7,
p95: ttft_p99 * 0.9,
p99: ttft_p99,
p999: ttft_p99 * 1.2,
},
itl_ms: ItlResults {
median: 10.0,
std_dev: 2.0,
p99: 15.0,
},
throughput_tok_s: ThroughputResults {
median: throughput,
ci_95: (throughput * 0.95, throughput * 1.05),
},
memory_mb: MemoryResults {
model_mb: memory_mb / 2,
peak_rss_mb: memory_mb,
kv_waste_pct: 3.0,
},
energy: EnergyResults {
total_joules: 50.0,
token_joules,
idle_watts: 8.0,
},
cold_start_ms: ColdStartResults {
median: 100.0,
p99: 150.0,
},
},
quality: QualityValidation {
kl_divergence_vs_fp32: 0.03,
perplexity_wikitext2: Some(5.89),
},
}
}
// `DynamicSampler::current_cv` edge cases: fewer than two samples cannot
// produce a coefficient of variation, so it returns +infinity.
#[test]
fn test_dynamic_sampler_current_cv_empty() {
let sampler = DynamicSampler::default();
let cv = sampler.current_cv(&[]);
assert!(cv.is_infinite());
}
#[test]
fn test_dynamic_sampler_current_cv_single_value() {
let sampler = DynamicSampler::default();
let cv = sampler.current_cv(&[100.0]);
assert!(cv.is_infinite());
}
// Constant data has zero standard deviation, hence CV ~ 0.
#[test]
fn test_dynamic_sampler_current_cv_constant_values() {
let sampler = DynamicSampler::default();
let data: Vec<f64> = vec![50.0; 100];
let cv = sampler.current_cv(&data);
assert!(cv.abs() < 1e-10, "CV of constant should be ~0");
}
// Only the trailing `cv_window` samples are used; a repeating sawtooth around
// 100 yields a small but strictly positive CV.
#[test]
fn test_dynamic_sampler_current_cv_varied_window() {
let sampler = DynamicSampler {
cv_window: 10,
..Default::default()
};
let data: Vec<f64> = (0..100).map(|i| 100.0 + (i as f64 % 10.0)).collect();
let cv = sampler.current_cv(&data);
assert!(cv > 0.0 && cv < 1.0);
}
// Window equals data length: CV of [10..50] (mean 30, sample std ~15.8) is ~0.53.
#[test]
fn test_dynamic_sampler_current_cv_small_window() {
let sampler = DynamicSampler {
cv_window: 5,
..Default::default()
};
let data = vec![10.0, 20.0, 30.0, 40.0, 50.0];
let cv = sampler.current_cv(&data);
assert!(cv > 0.4 && cv < 0.6);
}
// Pins the `Default` impl values declared at the top of this file.
#[test]
fn test_dynamic_sampler_default_values() {
let sampler = DynamicSampler::default();
assert_eq!(sampler.min_samples, 100);
assert_eq!(sampler.max_samples, 10_000);
assert!((sampler.cv_threshold - 0.05).abs() < 0.001);
assert_eq!(sampler.cv_window, 50);
assert_eq!(sampler.stability_count, 3);
}
// `ThermalGuard::temp_variance`: degenerate inputs (empty / single sample)
// report zero variance rather than erroring.
#[test]
fn test_thermal_guard_temp_variance_empty() {
let guard = ThermalGuard::default();
let variance = guard.temp_variance(&[]);
assert!((variance - 0.0).abs() < 0.001);
}
#[test]
fn test_thermal_guard_temp_variance_single() {
let guard = ThermalGuard::default();
let variance = guard.temp_variance(&[75.0]);
assert!((variance - 0.0).abs() < 0.001);
}
#[test]
fn test_thermal_guard_temp_variance_constant() {
let guard = ThermalGuard::default();
let temps = vec![72.0; 100];
let variance = guard.temp_variance(&temps);
assert!(variance < 0.001);
}
// Evenly spaced 70..78 degC: expected spread lands between 2 and 4
// (consistent with a std-dev-style measure; exact formula lives in ThermalGuard).
#[test]
fn test_thermal_guard_temp_variance_varied() {
let guard = ThermalGuard::default();
let temps = vec![70.0, 72.0, 74.0, 76.0, 78.0];
let variance = guard.temp_variance(&temps);
assert!(variance > 2.0 && variance < 4.0);
}
// `max_temp` on an empty slice falls back to 0.0 rather than panicking.
#[test]
fn test_thermal_guard_max_temp_empty() {
let guard = ThermalGuard::default();
assert_eq!(guard.max_temp(&[]), 0.0);
}
#[test]
fn test_thermal_guard_max_temp_single() {
let guard = ThermalGuard::default();
assert_eq!(guard.max_temp(&[82.5]), 82.5);
}
// 70 degC is below the default max temp, so this must return promptly
// without sleeping (no assertion: the test passes by not hanging/panicking).
#[test]
fn test_thermal_guard_cooldown_not_needed() {
let guard = ThermalGuard::default();
guard.cooldown_if_needed(70.0);
}
// `chrono_timestamp` apparently emits an epoch-anchored pseudo-ISO string
// containing "1970", 'T'/'Z' separators, and a "+...s" offset suffix —
// NOTE(review): format inferred from these substring checks; confirm against
// the `chrono_timestamp` implementation.
#[test]
fn test_chrono_timestamp_format() {
let ts = chrono_timestamp();
assert!(ts.contains("1970"));
assert!(ts.contains("T"));
assert!(ts.contains("Z"));
assert!(ts.contains("+"));
assert!(ts.contains("s"));
}
// `bootstrap_ci` edge cases: no data yields the degenerate (0, 0) interval.
#[test]
fn test_bootstrap_ci_empty() {
let (lower, upper) = bootstrap_ci(&[], 0.95, 1000);
assert_eq!(lower, 0.0);
assert_eq!(upper, 0.0);
}
// A single observation collapses the interval onto that value.
#[test]
fn test_bootstrap_ci_single_value() {
let (lower, upper) = bootstrap_ci(&[42.0], 0.95, 1000);
assert!((lower - 42.0).abs() < 0.01);
assert!((upper - 42.0).abs() < 0.01);
}
// For 1..=100 (mean 50.5) the 95% CI must bracket the mean; bounds are kept
// loose because the bootstrap is randomized.
#[test]
fn test_bootstrap_ci_varied_data() {
let data: Vec<f64> = (1..=100).map(|i| i as f64).collect();
let (lower, upper) = bootstrap_ci(&data, 0.95, 1000);
assert!(lower < 55.0);
assert!(upper > 45.0);
assert!(lower < upper);
}
// Constant data: resamples are identical, so any confidence level pins ~100.
#[test]
fn test_bootstrap_ci_narrow_confidence() {
let data = vec![100.0; 50];
let (lower, upper) = bootstrap_ci(&data, 0.50, 100);
assert!((lower - 100.0).abs() < 0.1);
assert!((upper - 100.0).abs() < 0.1);
}
// `percentile` degenerate inputs: empty -> 0.0, single value -> that value for
// every requested percentile.
#[test]
fn test_percentile_empty() {
assert_eq!(percentile(&[], 50.0), 0.0);
}
#[test]
fn test_percentile_single() {
assert_eq!(percentile(&[42.0], 50.0), 42.0);
assert_eq!(percentile(&[42.0], 99.0), 42.0);
}
// `compute_std_dev`: constant data has ~0 spread; empty input returns 0.0.
#[test]
fn test_compute_std_dev_constant() {
let data = vec![100.0; 50];
let std_dev = compute_std_dev(&data);
assert!(std_dev < 0.001);
}
#[test]
fn test_compute_std_dev_empty() {
let std_dev = compute_std_dev(&[]);
assert_eq!(std_dev, 0.0);
}
// `compute_variance`: empty and single-element inputs both report 0.0.
#[test]
fn test_compute_variance_empty() {
assert_eq!(compute_variance(&[]), 0.0);
}
#[test]
fn test_compute_variance_single() {
assert_eq!(compute_variance(&[100.0]), 0.0);
}
// `compute_cv` (defined near the top of this file) returns +infinity for
// fewer than two samples...
#[test]
fn test_compute_cv_single_value() {
let cv = compute_cv(&[100.0]);
assert!(cv.is_infinite());
}
// ...and also when the mean is ~0 (|mean| < 1e-10), since CV divides by it.
#[test]
fn test_compute_cv_zero_mean() {
let data = vec![-1.0, 1.0, -1.0, 1.0];
let cv = compute_cv(&data);
assert!(cv.is_infinite());
}
// `EnergyMetrics::tokens_per_joule` guards against division by zero: exactly
// zero joules and near-zero (1e-15) joules both report 0.0.
#[test]
fn test_energy_metrics_tokens_per_joule_zero_joules() {
let metrics = EnergyMetrics::new(0.0, 10.0, 50.0, 1000);
assert_eq!(metrics.tokens_per_joule(), 0.0);
}
#[test]
fn test_energy_metrics_very_small_joules() {
let metrics = EnergyMetrics::new(1e-15, 10.0, 50.0, 1000);
assert_eq!(metrics.tokens_per_joule(), 0.0);
}
// A single ITL measurement is its own median/p99/p999 with zero std-dev.
#[test]
fn test_itl_metrics_single_value() {
let metrics = ItlMetrics::from_measurements(&[15.0]);
assert_eq!(metrics.median_ms, 15.0);
assert_eq!(metrics.p99_ms, 15.0);
assert_eq!(metrics.p999_ms, 15.0);
assert_eq!(metrics.std_dev_ms, 0.0);
}
// Two values: median interpolates to the midpoint; spread is non-zero.
#[test]
fn test_itl_metrics_two_values() {
let metrics = ItlMetrics::from_measurements(&[10.0, 20.0]);
assert_eq!(metrics.median_ms, 15.0);
assert!(metrics.std_dev_ms > 0.0);
}
// Convoy test: with no head-of-line (HOL) blocking samples, both HOL
// aggregates must default to 0.0 instead of NaN.
#[test]
fn test_convoy_test_result_empty_hol() {
let config = ConvoyTestConfig::default();
let baseline = vec![10.0; 10];
let convoy = vec![11.0; 10];
let hol: Vec<f64> = vec![];
let kv_frag = 5.0;
let result = ConvoyTestResult::new(&config, &baseline, &convoy, &hol, kv_frag);
assert_eq!(result.avg_hol_blocking_ms, 0.0);
assert_eq!(result.max_hol_blocking_ms, 0.0);
}
// An all-zero baseline would make the p99 increase a division by zero; the
// constructor must report 0.0 instead.
#[test]
fn test_convoy_test_result_zero_baseline() {
let config = ConvoyTestConfig::default();
let baseline = vec![0.0; 10];
let convoy = vec![10.0; 10];
let hol = vec![50.0; 10];
let kv_frag = 5.0;
let result = ConvoyTestResult::new(&config, &baseline, &convoy, &hol, kv_frag);
assert_eq!(result.p99_increase_pct, 0.0);
}
// Saturation test with entirely empty measurement slices: all throughput and
// degradation figures collapse to 0.0.
#[test]
fn test_saturation_test_result_empty_data() {
let config = SaturationTestConfig::default();
let result = SaturationTestResult::new(&config, &[], &[], &[], &[]);
assert_eq!(result.baseline_throughput, 0.0);
assert_eq!(result.stressed_throughput, 0.0);
assert_eq!(result.throughput_degradation_pct, 0.0);
}
// Zero baselines again guard the percentage-change divisions.
#[test]
fn test_saturation_test_result_zero_baseline() {
let config = SaturationTestConfig::default();
let result =
SaturationTestResult::new(&config, &[0.0; 10], &[50.0; 10], &[0.0; 10], &[10.0; 10]);
assert_eq!(result.throughput_degradation_pct, 0.0);
assert_eq!(result.p99_increase_pct, 0.0);
}
#[test]
fn test_benchmark_comparison_zero_baselines() {
    // An all-zero baseline must not produce inf/NaN percentage changes:
    // every *_change_pct is clamped to 0.0 when its baseline metric is 0.
    let baseline = create_test_full_result("baseline", 0.0, 0.0, 0, 0.0);
    let current = create_test_full_result("current", 30.0, 140.0, 1200, 0.04);
    // BUG FIX: `&current` had been mangled into the non-compiling token
    // `¤t` by an HTML-entity round-trip (`&curren` -> `¤`).
    let comparison = BenchmarkComparison::compare(&baseline, &current);
    assert_eq!(comparison.ttft_p99_change_pct, 0.0);
    assert_eq!(comparison.throughput_change_pct, 0.0);
    assert_eq!(comparison.memory_change_pct, 0.0);
    assert_eq!(comparison.energy_change_pct, 0.0);
}
#[test]
fn test_regression_result_zero_baselines() {
    // With zero-valued baseline metrics there is nothing to regress against,
    // so no regression may be reported.
    let baseline = create_test_full_result("test", 0.0, 0.0, 0, 0.0);
    let current = create_test_full_result("test", 30.0, 140.0, 1200, 0.04);
    // BUG FIX: `&current` had been mangled into the non-compiling token
    // `¤t` by an HTML-entity round-trip (`&curren` -> `¤`).
    let regression = RegressionResult::check(&baseline, &current, 5.0);
    assert!(!regression.regression_detected);
}
// `BenchmarkResult::summary` must not divide by zero when no tokens were
// generated: joules-per-token falls back to 0.0.
#[test]
fn test_benchmark_result_zero_tokens() {
let result = BenchmarkResult {
config: BenchmarkConfig {
model: "test".to_string(),
format: "apr".to_string(),
quantization: "q4_k".to_string(),
runtime: "realizar".to_string(),
runtime_version: "0.2.3".to_string(),
},
cold_start_ms: 100.0,
model_load_ms: 50.0,
ttft_ms: vec![20.0],
itl_ms: vec![10.0],
generation_tok_s: vec![140.0],
peak_memory_mb: 1024,
kv_cache_waste_pct: 3.5,
energy_joules: 50.0,
// The edge case under test: zero tokens generated.
tokens_generated: 0,
actual_iterations: 100,
cv_at_stop: 0.04,
timestamp: 12345,
};
let summary = result.summary();
assert_eq!(summary.token_joules, 0.0);
}
// Used > allocated is nonsensical; fragmentation clamps to 0 instead of
// going negative.
#[test]
fn test_kv_cache_used_more_than_allocated() {
let metrics = KvCacheMetrics::new(1000, 1500);
assert_eq!(metrics.fragmentation_pct, 0.0);
}
// Softmax of a single logit is the trivial distribution [1.0].
#[test]
fn test_softmax_single_value() {
let probs = softmax(&[5.0]);
assert_eq!(probs.len(), 1);
assert!((probs[0] - 1.0).abs() < 1e-10);
}
// Softmax is shift-invariant: negative logits still sum to 1 and preserve
// the monotone ordering of the inputs.
#[test]
fn test_softmax_negative_values() {
let logits = vec![-5.0, -3.0, -1.0, 0.0, 1.0];
let probs = softmax(&logits);
let sum: f64 = probs.iter().sum();
assert!((sum - 1.0).abs() < 1e-10);
for i in 1..probs.len() {
assert!(probs[i] > probs[i - 1]);
}
}
// Converting a raw `BenchmarkResult` with a widely varying temperature trace
// (60..85 degC) must mark the run thermally invalid and report a variance
// above the 2 degC tolerance.
#[test]
fn test_full_benchmark_result_invalid_thermal() {
let result = BenchmarkResult {
config: BenchmarkConfig {
model: "test".to_string(),
format: "apr".to_string(),
quantization: "q4_k".to_string(),
runtime: "realizar".to_string(),
runtime_version: "0.2.3".to_string(),
},
cold_start_ms: 100.0,
model_load_ms: 50.0,
ttft_ms: vec![20.0],
itl_ms: vec![10.0],
generation_tok_s: vec![140.0],
peak_memory_mb: 1024,
kv_cache_waste_pct: 3.5,
energy_joules: 50.0,
tokens_generated: 1000,
actual_iterations: 100,
cv_at_stop: 0.04,
timestamp: 12345,
};
// 25 degC swing — far outside any stable thermal envelope.
let temps = vec![60.0, 70.0, 80.0, 65.0, 85.0];
let full_result = FullBenchmarkResult::from_benchmark_result(
&result,
HardwareSpec::default(),
&temps,
0.03,
);
assert!(!full_result.thermal.valid);
assert!(full_result.thermal.temp_variance_c > 2.0);
}
#[test]
fn test_benchmark_comparison_baseline_wins() {
    // Baseline beats current on every axis (faster TTFT, higher throughput,
    // less memory, less energy), so it must be declared the winner.
    let baseline = create_test_full_result("baseline", 25.0, 160.0, 1000, 0.03);
    let current = create_test_full_result("current", 40.0, 100.0, 1500, 0.06);
    // BUG FIX: `&current` had been mangled into the non-compiling token
    // `¤t` by an HTML-entity round-trip (`&curren` -> `¤`).
    let comparison = BenchmarkComparison::compare(&baseline, &current);
    assert_eq!(comparison.winner, "baseline");
}
// Smoke tests for the derived `Debug` representations of the validity and
// quality enums — the variant name must appear in the formatted output.
#[test]
fn test_thermal_validity_debug() {
let valid = ThermalValidity::Valid;
let invalid = ThermalValidity::Invalid("test".to_string());
assert!(format!("{valid:?}").contains("Valid"));
assert!(format!("{invalid:?}").contains("Invalid"));
}
#[test]
fn test_quality_result_debug() {
let pass = QualityResult::Pass {
kl_divergence: 0.01,
};
let fail = QualityResult::Fail {
kl_divergence: 0.5,
threshold: 0.1,
message: "test",
};
assert!(format!("{pass:?}").contains("Pass"));
assert!(format!("{fail:?}").contains("Fail"));
}
// `WorkloadType` derives PartialEq and Copy; these pin both behaviors.
#[test]
fn test_workload_type_equality() {
assert_eq!(WorkloadType::ShortQa, WorkloadType::ShortQa);
assert_eq!(WorkloadType::LongContext, WorkloadType::LongContext);
assert_ne!(WorkloadType::ShortQa, WorkloadType::LongContext);
}
#[test]
fn test_workload_type_copy() {
let wt = WorkloadType::ShortQa;
let wt_copy = wt;
assert_eq!(wt, wt_copy);
}
// Round-trip of the runtime-name mapping: `as_str` is the canonical spelling,
// `parse` accepts it (plus the "llama.cpp" alias) and rejects unknown names.
#[test]
fn test_runtime_type_display() {
assert_eq!(RuntimeType::Realizar.as_str(), "realizar");
assert_eq!(RuntimeType::LlamaCpp.as_str(), "llama-cpp");
assert_eq!(RuntimeType::Vllm.as_str(), "vllm");
assert_eq!(RuntimeType::Ollama.as_str(), "ollama");
}
#[test]
fn test_runtime_type_from_str() {
assert_eq!(RuntimeType::parse("realizar"), Some(RuntimeType::Realizar));
assert_eq!(RuntimeType::parse("llama-cpp"), Some(RuntimeType::LlamaCpp));
assert_eq!(RuntimeType::parse("llama.cpp"), Some(RuntimeType::LlamaCpp));
assert_eq!(RuntimeType::parse("vllm"), Some(RuntimeType::Vllm));
assert_eq!(RuntimeType::parse("ollama"), Some(RuntimeType::Ollama));
assert_eq!(RuntimeType::parse("unknown"), None);
}
// Defaults: empty prompt, 100 max tokens, some positive temperature.
#[test]
fn test_inference_request_default() {
let req = InferenceRequest::default();
assert_eq!(req.prompt, "");
assert_eq!(req.max_tokens, 100);
assert!(req.temperature > 0.0);
}
// Builder-style construction of `InferenceRequest` chains `with_*` setters.
#[test]
fn test_inference_request_builder() {
let req = InferenceRequest::new("Hello, world!")
.with_max_tokens(50)
.with_temperature(0.5);
assert_eq!(req.prompt, "Hello, world!");
assert_eq!(req.max_tokens, 50);
assert!((req.temperature - 0.5).abs() < 0.001);
}
// 100 tokens in 1000 ms => 100 tok/s.
#[test]
fn test_inference_response_tokens_per_second() {
let response = InferenceResponse {
text: "Hello".to_string(),
tokens_generated: 100,
ttft_ms: 50.0,
total_time_ms: 1000.0,
itl_ms: vec![10.0, 10.0, 10.0],
};
assert!((response.tokens_per_second() - 100.0).abs() < 0.1);
}
// Zero elapsed time must not divide by zero: rate reports 0.0.
#[test]
fn test_inference_response_tokens_per_second_zero_time() {
let response = InferenceResponse {
text: String::new(),
tokens_generated: 100,
ttft_ms: 0.0,
total_time_ms: 0.0,
itl_ms: vec![],
};
assert_eq!(response.tokens_per_second(), 0.0);
}
// MockBackend echoes its configured TTFT and produces at least one token.
#[test]
fn test_mock_backend_inference() {
let backend = MockBackend::new(42.0, 150.0);
let req = InferenceRequest::new("test prompt");
let response = backend.inference(&req);
assert!(response.is_ok());
let resp = response.unwrap();
assert!((resp.ttft_ms - 42.0).abs() < 0.001);
assert!(resp.tokens_generated > 0);
}
// MockBackend advertises itself as the Realizar runtime with streaming support.
#[test]
fn test_mock_backend_info() {
let backend = MockBackend::new(30.0, 140.0);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::Realizar);
assert!(!info.version.is_empty());
assert!(info.supports_streaming);
}
// A fresh registry holds no backends.
#[test]
fn test_backend_registry_default() {
let registry = BackendRegistry::new();
assert!(registry.get(RuntimeType::Realizar).is_none());
}
// Registration is keyed by runtime type; unregistered types stay absent.
#[test]
fn test_backend_registry_register_and_get() {
let mut registry = BackendRegistry::new();
let backend = Box::new(MockBackend::new(30.0, 140.0));
registry.register(RuntimeType::Realizar, backend);
assert!(registry.get(RuntimeType::Realizar).is_some());
assert!(registry.get(RuntimeType::LlamaCpp).is_none());
}
// `list` enumerates every registered runtime type (order unspecified).
#[test]
fn test_backend_registry_list() {
let mut registry = BackendRegistry::new();
registry.register(
RuntimeType::Realizar,
Box::new(MockBackend::new(30.0, 140.0)),
);
registry.register(
RuntimeType::LlamaCpp,
Box::new(MockBackend::new(35.0, 130.0)),
);
let list = registry.list();
assert_eq!(list.len(), 2);
assert!(list.contains(&RuntimeType::Realizar));
assert!(list.contains(&RuntimeType::LlamaCpp));
}
// llama.cpp config defaults: CLI binary on PATH, CPU-only, 2048-token context.
#[test]
fn test_llama_cpp_config_default() {
let config = LlamaCppConfig::default();
assert_eq!(config.binary_path, "llama-cli");
assert_eq!(config.n_gpu_layers, 0);
assert_eq!(config.ctx_size, 2048);
}
#[test]
fn test_llama_cpp_config_builder() {
let config = LlamaCppConfig::new("/usr/bin/llama-cli")
.with_model("/models/test.gguf")
.with_gpu_layers(32)
.with_ctx_size(4096);
assert_eq!(config.binary_path, "/usr/bin/llama-cli");
assert_eq!(config.model_path, Some("/models/test.gguf".to_string()));
assert_eq!(config.n_gpu_layers, 32);
assert_eq!(config.ctx_size, 4096);
}
// vLLM config defaults: local server on port 8000, OpenAI-style v1 API.
#[test]
fn test_vllm_config_default() {
let config = VllmConfig::default();
assert_eq!(config.base_url, "http://localhost:8000");
assert_eq!(config.api_version, "v1");
}
#[test]
fn test_vllm_config_builder() {
let config = VllmConfig::new("http://gpu-server:8080")
.with_model("meta-llama/Llama-2-7b")
.with_api_key("test-key");
assert_eq!(config.base_url, "http://gpu-server:8080");
assert_eq!(config.model, Some("meta-llama/Llama-2-7b".to_string()));
assert_eq!(config.api_key, Some("test-key".to_string()));
}
// The llama.cpp subprocess backend reports its runtime type via `info()`.
#[test]
fn test_llama_cpp_backend_creation() {
let config = LlamaCppConfig::new("llama-cli");
let backend = LlamaCppBackend::new(config);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::LlamaCpp);
assert!(!info.version.is_empty());
}
// Subprocess invocation does not support streaming tokens back.
#[test]
fn test_llama_cpp_backend_info() {
let config = LlamaCppConfig::new("llama-cli").with_model("test.gguf");
let backend = LlamaCppBackend::new(config);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::LlamaCpp);
assert!(!info.supports_streaming); }
// A nonexistent binary path surfaces as an inference error, not a panic.
#[test]
fn test_llama_cpp_backend_missing_binary() {
let config = LlamaCppConfig::new("/nonexistent/llama-cli");
let backend = LlamaCppBackend::new(config);
let request = InferenceRequest::new("test");
let result = backend.inference(&request);
assert!(result.is_err());
}
#[cfg(feature = "bench-http")]
#[test]
fn test_vllm_backend_creation() {
let config = VllmConfig::new("http://localhost:8000");
let backend = VllmBackend::new(config);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::Vllm);
}
#[cfg(feature = "bench-http")]
#[test]
fn test_vllm_backend_info() {
let config = VllmConfig::new("http://localhost:8000").with_model("meta-llama/Llama-2-7b");
let backend = VllmBackend::new(config);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::Vllm);
assert!(info.supports_streaming); }
// Port 99999 is outside the valid range, so connecting (or even parsing the
// URL) must fail and be reported as an error.
#[cfg(feature = "bench-http")]
#[test]
fn test_vllm_backend_connection_error() {
let config = VllmConfig::new("http://localhost:99999"); let backend = VllmBackend::new(config);
let request = InferenceRequest::new("test");
let result = backend.inference(&request);
assert!(result.is_err());
}
// Measurement-protocol defaults: 100 latency samples, standard percentile set,
// 60 s throughput window with 10 s ramp-up, 10 memory samples.
#[test]
fn test_measurement_protocol_default() {
let protocol = MeasurementProtocol::default();
assert_eq!(protocol.latency_samples, 100);
assert_eq!(
protocol.latency_percentiles,
vec![50.0, 90.0, 95.0, 99.0, 99.9]
);
assert_eq!(protocol.throughput_duration.as_secs(), 60);
assert_eq!(protocol.throughput_ramp_up.as_secs(), 10);
assert_eq!(protocol.memory_samples, 10);
}
#[test]
fn test_measurement_protocol_builder() {
let protocol = MeasurementProtocol::new()
.with_latency_samples(200)
.with_percentiles(vec![50.0, 95.0, 99.0])
.with_throughput_duration(Duration::from_secs(120))
.with_memory_samples(20);
assert_eq!(protocol.latency_samples, 200);
assert_eq!(protocol.latency_percentiles, vec![50.0, 95.0, 99.0]);
assert_eq!(protocol.throughput_duration.as_secs(), 120);
assert_eq!(protocol.memory_samples, 20);
}
// Basic aggregates over five evenly spaced samples: min/max/mean are exact.
#[test]
fn test_latency_statistics_from_samples() {
let samples = vec![
Duration::from_millis(10),
Duration::from_millis(20),
Duration::from_millis(30),
Duration::from_millis(40),
Duration::from_millis(50),
];
let stats = LatencyStatistics::from_samples(&samples);
assert_eq!(stats.samples, 5);
assert_eq!(stats.min, Duration::from_millis(10));
assert_eq!(stats.max, Duration::from_millis(50));
assert_eq!(stats.mean, Duration::from_millis(30));
}
// Percentiles over 1..=100 ms; +/-1 ms slack allows for either interpolation
// or nearest-rank implementations.
#[test]
fn test_latency_statistics_percentiles() {
let samples: Vec<Duration> = (1..=100).map(Duration::from_millis).collect();
let stats = LatencyStatistics::from_samples(&samples);
assert!(stats.p50 >= Duration::from_millis(49));
assert!(stats.p50 <= Duration::from_millis(51));
assert!(stats.p95 >= Duration::from_millis(94));
assert!(stats.p95 <= Duration::from_millis(96));
assert!(stats.p99 >= Duration::from_millis(98));
assert!(stats.p99 <= Duration::from_millis(100));
}
// The 95% CI must straddle the mean.
#[test]
fn test_latency_statistics_confidence_interval() {
let samples: Vec<Duration> = (1..=100).map(Duration::from_millis).collect();
let stats = LatencyStatistics::from_samples(&samples);
let (lower, upper) = stats.confidence_interval_95;
assert!(lower < stats.mean);
assert!(upper > stats.mean);
}
// Non-constant samples yield a strictly positive std-dev.
#[test]
fn test_latency_statistics_std_dev() {
let samples: Vec<Duration> = (1..=10).map(|i| Duration::from_millis(i * 10)).collect();
let stats = LatencyStatistics::from_samples(&samples);
assert!(stats.std_dev > Duration::ZERO);
}
// `detect_outliers(samples, threshold)` returns indices of outlying samples
// (threshold 3.5 matches the conventional modified-z-score cutoff —
// NOTE(review): scoring method inferred from the threshold value; confirm
// against the `detect_outliers` implementation).
#[test]
fn test_outlier_detector_no_outliers() {
let samples = vec![10.0, 11.0, 10.5, 9.5, 10.2, 9.8, 10.1, 10.3];
let outliers = detect_outliers(&samples, 3.5); assert!(outliers.is_empty());
}
// 100.0 among ~10s is flagged; the returned value is its index (8).
#[test]
fn test_outlier_detector_single_outlier() {
let samples = vec![10.0, 11.0, 10.5, 9.5, 10.2, 9.8, 10.1, 10.3, 100.0];
let outliers = detect_outliers(&samples, 3.5);
assert_eq!(outliers.len(), 1);
assert_eq!(outliers[0], 8);
}
// Both the low (0.1) and high (100.0) extremes are flagged.
#[test]
fn test_outlier_detector_multiple_outliers() {
let samples = vec![0.1, 10.0, 11.0, 10.5, 9.5, 10.2, 9.8, 10.1, 100.0];
let outliers = detect_outliers(&samples, 3.5);
assert_eq!(outliers.len(), 2);
assert!(outliers.contains(&0)); assert!(outliers.contains(&8)); }
// A stricter (lower) threshold can only flag at least as many points.
#[test]
fn test_outlier_detector_threshold_sensitivity() {
let samples = vec![10.0, 11.0, 10.5, 9.5, 10.2, 9.8, 10.1, 15.0];
let strict_outliers = detect_outliers(&samples, 2.0);
let lenient_outliers = detect_outliers(&samples, 5.0);
assert!(strict_outliers.len() >= lenient_outliers.len());
}
// Default thresholds: warn at 2% slowdown, fail at 5%.
#[test]
fn test_regression_detector_default() {
let detector = RegressionDetector::default();
assert_eq!(detector.warning_threshold, 0.02); assert_eq!(detector.failure_threshold, 0.05); }
#[test]
fn test_regression_detector_no_regression() {
    // A 1% mean increase is below the 2% warning threshold, so the comparison
    // passes with no regressions recorded.
    let baseline = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 100.0,
        std_dev: 5.0,
        samples: 100,
    };
    let current = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 101.0,
        std_dev: 5.0,
        samples: 100,
    };
    let detector = RegressionDetector::default();
    // BUG FIX: `&current` had been mangled into the non-compiling token
    // `¤t` by an HTML-entity round-trip (`&curren` -> `¤`).
    let report = detector.compare(&baseline, &current);
    assert!(report.passed);
    assert!(report.regressions.is_empty());
}
#[test]
fn test_regression_detector_warning() {
    // A 3% mean increase sits between the 2% warning and 5% failure
    // thresholds: the run still passes, but one warning is emitted.
    let baseline = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 100.0,
        std_dev: 5.0,
        samples: 100,
    };
    let current = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 103.0,
        std_dev: 5.0,
        samples: 100,
    };
    let detector = RegressionDetector::default();
    // BUG FIX: `&current` had been mangled into the non-compiling token
    // `¤t` by an HTML-entity round-trip (`&curren` -> `¤`).
    let report = detector.compare(&baseline, &current);
    assert!(report.passed);
    assert_eq!(report.warnings.len(), 1);
}
#[test]
fn test_regression_detector_failure() {
    // A 10% mean increase exceeds the 5% failure threshold: the comparison
    // fails and records exactly one regression.
    let baseline = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 100.0,
        std_dev: 5.0,
        samples: 100,
    };
    let current = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 110.0,
        std_dev: 5.0,
        samples: 100,
    };
    let detector = RegressionDetector::default();
    // BUG FIX: `&current` had been mangled into the non-compiling token
    // `¤t` by an HTML-entity round-trip (`&curren` -> `¤`).
    let report = detector.compare(&baseline, &current);
    assert!(!report.passed);
    assert_eq!(report.regressions.len(), 1);
}
#[test]
fn test_regression_detector_improvement() {
    // A 10% mean decrease is an improvement: the run passes and the change is
    // recorded under `improvements` rather than `regressions`.
    let baseline = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 100.0,
        std_dev: 5.0,
        samples: 100,
    };
    let current = BenchmarkMetrics {
        name: "latency".to_string(),
        mean: 90.0,
        std_dev: 5.0,
        samples: 100,
    };
    let detector = RegressionDetector::default();
    // BUG FIX: `&current` had been mangled into the non-compiling token
    // `¤t` by an HTML-entity round-trip (`&curren` -> `¤`).
    let report = detector.compare(&baseline, &current);
    assert!(report.passed);
    assert_eq!(report.improvements.len(), 1);
}
// Basic sanity of the Welch result struct: finite t, positive degrees of
// freedom, p in [0, 1]; the 10-unit mean gap is clearly significant.
#[test]
fn test_welch_t_test_result_fields() {
let sample_a = vec![10.0, 11.0, 10.5, 10.2, 10.8];
let sample_b = vec![20.0, 21.0, 20.5, 20.2, 20.8];
let result = welch_t_test(&sample_a, &sample_b, 0.05);
assert!(result.t_statistic.is_finite());
assert!(result.degrees_of_freedom > 0.0);
assert!(result.p_value >= 0.0 && result.p_value <= 1.0);
assert!(result.significant);
}
// Identical (zero-variance) samples must never be declared different.
#[test]
fn test_welch_t_test_identical_samples() {
let sample_a = vec![10.0, 10.0, 10.0, 10.0, 10.0];
let sample_b = vec![10.0, 10.0, 10.0, 10.0, 10.0];
let result = welch_t_test(&sample_a, &sample_b, 0.05);
assert!(!result.significant);
assert!(result.t_statistic.abs() < 1e-10 || result.p_value > 0.05);
}
// A 40-unit separation with tight spread is overwhelmingly significant.
#[test]
fn test_welch_t_test_clearly_different() {
let sample_a = vec![10.0, 11.0, 10.5, 10.2, 10.8, 10.3, 10.7, 10.1];
let sample_b = vec![50.0, 51.0, 50.5, 50.2, 50.8, 50.3, 50.7, 50.1];
let result = welch_t_test(&sample_a, &sample_b, 0.05);
assert!(result.significant);
assert!(result.p_value < 0.001); }
// Equal means but very unequal variances: Welch's correction should keep
// this non-significant (this is the case Student's t mishandles).
#[test]
fn test_welch_t_test_unequal_variance() {
let sample_a = vec![10.0, 10.1, 10.0, 10.1, 10.0]; let sample_b = vec![10.0, 15.0, 5.0, 20.0, 0.0]; let result = welch_t_test(&sample_a, &sample_b, 0.05);
assert!(!result.significant);
}
// Welch–Satterthwaite degrees of freedom stay positive even for n=3.
#[test]
fn test_welch_t_test_small_samples() {
let sample_a = vec![10.0, 11.0, 12.0];
let sample_b = vec![12.0, 13.0, 14.0];
let result = welch_t_test(&sample_a, &sample_b, 0.05);
assert!(result.degrees_of_freedom > 0.0);
}
// Monotonicity in alpha: anything significant at 0.01 is significant at 0.10.
#[test]
fn test_welch_t_test_alpha_levels() {
let sample_a = vec![10.0, 11.0, 10.5, 10.2, 10.8];
let sample_b = vec![11.0, 12.0, 11.5, 11.2, 11.8];
let result_strict = welch_t_test(&sample_a, &sample_b, 0.01);
let result_lenient = welch_t_test(&sample_a, &sample_b, 0.10);
if result_strict.significant {
assert!(result_lenient.significant);
}
}
// `ThermalGuard::new` stores its four parameters verbatim...
#[test]
fn test_thermal_guard_struct_fields() {
let guard = ThermalGuard::new(80.0, 70.0, 10_000, 2.0);
assert_eq!(guard.max_temp_c, 80.0);
assert_eq!(guard.cooldown_threshold_c, 70.0);
assert_eq!(guard.cooldown_sleep_ms, 10_000);
assert_eq!(guard.temp_variance_c, 2.0);
}
// ...and `Default` uses the same values (80/70 degC, 10 s sleep, 2 degC variance).
#[test]
fn test_thermal_guard_default() {
let guard = ThermalGuard::default();
assert_eq!(guard.max_temp_c, 80.0);
assert_eq!(guard.cooldown_threshold_c, 70.0);
assert_eq!(guard.cooldown_sleep_ms, 10_000);
assert_eq!(guard.temp_variance_c, 2.0);
}
// A tight ~1.5 degC trace is within the 2 degC variance budget: run is valid.
#[test]
fn test_thermal_validity_valid() {
let guard = ThermalGuard::default();
let temps = vec![75.0, 76.0, 75.5, 76.5, 75.2]; let result = guard.validate_run(&temps);
assert!(matches!(result, ThermalValidity::Valid));
}
// A 25 degC swing blows the budget: run is rejected with a reason string.
#[test]
fn test_thermal_validity_invalid_high_variance() {
let guard = ThermalGuard::default();
let temps = vec![60.0, 80.0, 65.0, 85.0, 70.0]; let result = guard.validate_run(&temps);
assert!(matches!(result, ThermalValidity::Invalid(_)));
}
// Cooldown triggers above max_temp_c (80): 85 -> yes, 75 -> no.
#[test]
fn test_thermal_needs_cooldown_above_max() {
let guard = ThermalGuard::default();
assert!(guard.needs_cooldown(85.0)); }
#[test]
fn test_thermal_needs_cooldown_below_max() {
let guard = ThermalGuard::default();
assert!(!guard.needs_cooldown(75.0)); }
// Pattern-matching coverage of both QualityResult variants.
#[test]
fn test_quality_result_pass() {
let result = QualityResult::Pass {
kl_divergence: 0.001,
};
match result {
QualityResult::Pass { kl_divergence } => assert!(kl_divergence < 0.01),
QualityResult::Fail { .. } => panic!("Expected Pass"),
}
}
#[test]
fn test_quality_result_fail() {
let result = QualityResult::Fail {
kl_divergence: 0.1,
threshold: 0.05,
message: "Degradation detected",
};
match result {
QualityResult::Fail {
kl_divergence,
threshold,
message,
} => {
assert!(kl_divergence > threshold);
assert!(!message.is_empty());
},
QualityResult::Pass { .. } => panic!("Expected Fail"),
}
}
// Identical logits => zero KL divergence => always passes.
#[test]
fn test_validate_quantization_identical() {
let fp32_logits: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
let quant_logits: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
let result = validate_quantization_quality(&fp32_logits, &quant_logits, 0.01);
assert!(matches!(result, QualityResult::Pass { .. }));
}
// A uniform +0.01 shift barely perturbs the softmax distribution, so KL stays
// under the 0.05 threshold.
#[test]
fn test_validate_quantization_slight_difference() {
let fp32_logits: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
let quant_logits: Vec<f32> = vec![1.01, 2.01, 3.01, 4.01]; let result = validate_quantization_quality(&fp32_logits, &quant_logits, 0.05);
assert!(matches!(result, QualityResult::Pass { .. }));
}
// Reversing the logit order inverts the distribution: KL far exceeds 0.01.
#[test]
fn test_validate_quantization_large_difference() {
let fp32_logits: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
let quant_logits: Vec<f32> = vec![4.0, 3.0, 2.0, 1.0]; let result = validate_quantization_quality(&fp32_logits, &quant_logits, 0.01);
assert!(matches!(result, QualityResult::Fail { .. }));
}
// Softmax invariants: probabilities sum to 1 and preserve logit ordering.
#[test]
fn test_softmax_basic() {
let logits: Vec<f32> = vec![1.0, 2.0, 3.0];
let probs = softmax(&logits);
let sum: f64 = probs.iter().sum();
assert!((sum - 1.0).abs() < 1e-10);
assert!(probs[2] > probs[1]);
assert!(probs[1] > probs[0]);
}
// All Ollama tests are gated on the `bench-http` feature (HTTP client dep).
#[cfg(feature = "bench-http")]
#[test]
fn test_ollama_backend_creation() {
let config = OllamaConfig {
base_url: "http://localhost:11434".to_string(),
model: "llama2".to_string(),
};
let backend = OllamaBackend::new(config);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::Ollama);
}
// `info()` reports streaming support and echoes the configured model name.
#[cfg(feature = "bench-http")]
#[test]
fn test_ollama_backend_info() {
let config = OllamaConfig {
base_url: "http://localhost:11434".to_string(),
model: "phi2:2.7b".to_string(),
};
let backend = OllamaBackend::new(config);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::Ollama);
assert!(info.supports_streaming);
assert_eq!(info.loaded_model, Some("phi2:2.7b".to_string()));
}
// Port 59999 is assumed to have no listener: inference must return Err rather
// than panic or hang.
#[cfg(feature = "bench-http")]
#[test]
fn test_ollama_backend_connection_error() {
let config = OllamaConfig {
base_url: "http://localhost:59999".to_string(),
model: "test".to_string(),
};
let backend = OllamaBackend::new(config);
let request = InferenceRequest::new("test");
let result = backend.inference(&request);
assert!(result.is_err());
}
// Defaults match Ollama's stock local endpoint and model.
#[cfg(feature = "bench-http")]
#[test]
fn test_ollama_config_default() {
let config = OllamaConfig::default();
assert_eq!(config.base_url, "http://localhost:11434");
assert_eq!(config.model, "llama2");
}
// A caller-supplied HTTP client (custom timeout) is accepted via `with_client`.
#[cfg(feature = "bench-http")]
#[test]
fn test_ollama_backend_with_custom_client() {
use crate::http_client::ModelHttpClient;
let config = OllamaConfig {
base_url: "http://localhost:11434".to_string(),
model: "llama2".to_string(),
};
let client = ModelHttpClient::with_timeout(30);
let backend = OllamaBackend::with_client(config, client);
let info = backend.info();
assert_eq!(info.runtime_type, RuntimeType::Ollama);
}
// Integration test against a live Ollama server; `#[ignore]`d so CI without
// the server skips it. Asserts only that real latency/token figures come back.
#[cfg(feature = "bench-http")]
#[test]
#[ignore = "Requires Ollama server at localhost:11434"]
fn test_ollama_backend_real_inference() {
let config = OllamaConfig {
base_url: "http://localhost:11434".to_string(),
model: "phi2:2.7b".to_string(),
};
let backend = OllamaBackend::new(config);
let request = InferenceRequest::new("What is 2+2?")
.with_max_tokens(20)
.with_temperature(0.1);
let result = backend.inference(&request);
let response = result.expect("Ollama inference failed - is server running?");
assert!(
response.ttft_ms > 0.0,
"TTFT must be positive (real latency)"
);
assert!(response.total_time_ms > 0.0, "Total time must be positive");
assert!(response.tokens_generated > 0, "Must generate tokens");
assert!(!response.text.is_empty(), "Must get actual text");
println!("Ollama Real Inference via Backend:");
println!(" TTFT: {:.2}ms", response.ttft_ms);
println!(" Total: {:.2}ms", response.total_time_ms);
println!(" Tokens: {}", response.tokens_generated);
println!(" Text: {}", response.text);
}
// Distributed-benchmark config defaults: 7B model, 2048 context, single-batch,
// sweeping 1/2/4/8 GPUs with an 85% scaling-efficiency bar.
#[test]
fn test_distributed_bench_config_default() {
let config = DistributedBenchConfig::default();
assert_eq!(config.gpu_counts, vec![1, 2, 4, 8]);
assert_eq!(config.iterations, 100);
assert_eq!(config.warmup, 10);
assert_eq!(config.model_params, 7_000_000_000);
assert_eq!(config.seq_len, 2048);
assert_eq!(config.batch_size, 1);
assert!((config.efficiency_threshold - 0.85).abs() < 0.001);
}
// Small-model preset: 125M params, only 1-2 GPUs, relaxed 80% bar.
#[test]
fn test_distributed_bench_config_small_model() {
let config = DistributedBenchConfig::for_small_model();
assert_eq!(config.gpu_counts, vec![1, 2]);
assert_eq!(config.model_params, 125_000_000);
assert!((config.efficiency_threshold - 0.80).abs() < 0.001);
}
// Large-model preset: 70B params, longer context, multi-GPU only.
#[test]
fn test_distributed_bench_config_large_model() {
let config = DistributedBenchConfig::for_large_model();
assert_eq!(config.gpu_counts, vec![2, 4, 8]);
assert_eq!(config.model_params, 70_000_000_000);
assert_eq!(config.seq_len, 4096);
}
// A new suite stores its config and starts with all result sets empty.
#[test]
fn test_distributed_bench_suite_new() {
let config = DistributedBenchConfig::default();
let suite = DistributedBenchSuite::new(config.clone());
assert_eq!(suite.config().gpu_counts, config.gpu_counts);
assert!(suite.scaling_results().is_empty());
assert!(suite.tp_results().is_empty());
assert!(suite.pp_results().is_empty());
assert!(suite.comm_results().is_empty());
}
// Scaling benchmark: one result per configured GPU count. The single-GPU row
// is the reference (efficiency 1.0, zero comm overhead); every multi-GPU row
// has sub-linear efficiency, positive comm overhead, and p99 > p50.
#[test]
fn test_distributed_bench_scaling() {
let config = DistributedBenchConfig::default();
let mut suite = DistributedBenchSuite::new(config);
suite.run_scaling_benchmark();
let results = suite.scaling_results();
assert_eq!(results.len(), 4);
assert_eq!(results[0].gpu_count, 1);
assert!((results[0].efficiency - 1.0).abs() < 0.001);
assert!(results[0].comm_overhead_ms.abs() < 0.001);
for result in results.iter().skip(1) {
assert!(result.efficiency < 1.0);
assert!(result.efficiency > 0.0); assert!(result.comm_overhead_ms > 0.0);
assert!(result.throughput_tps > 0.0);
assert!(result.latency_p50_ms > 0.0);
assert!(result.latency_p99_ms > result.latency_p50_ms);
}
let gpu2 = results.iter().find(|r| r.gpu_count == 2).unwrap();
assert!(gpu2.efficiency > 0.85, "2-GPU efficiency should be >85%");
}
#[test]
fn test_scaling_efficiency_result_meets_threshold() {
let result = ScalingEfficiencyResult {
gpu_count: 4,
throughput_tps: 400.0,
latency_p50_ms: 2.5,
latency_p99_ms: 3.75,
efficiency: 0.90,
comm_overhead_ms: 0.5,
theoretical_speedup: 3.6,
achieved_speedup: 3.4,
};
assert!(result.meets_threshold(0.85));
assert!(result.meets_threshold(0.90));
assert!(!result.meets_threshold(0.95));
}
#[test]
fn test_scaling_efficiency_parallel_fraction() {
let result = ScalingEfficiencyResult {
gpu_count: 4,
throughput_tps: 400.0,
latency_p50_ms: 2.5,
latency_p99_ms: 3.75,
efficiency: 0.85,
comm_overhead_ms: 0.5,
theoretical_speedup: 3.6,
achieved_speedup: 3.4,
};
let parallel = result.parallel_fraction();
assert!(parallel > 0.8); assert!(parallel <= 1.0);
let single = ScalingEfficiencyResult {
gpu_count: 1,
throughput_tps: 100.0,
latency_p50_ms: 10.0,
latency_p99_ms: 15.0,
efficiency: 1.0,
comm_overhead_ms: 0.0,
theoretical_speedup: 1.0,
achieved_speedup: 1.0,
};
assert!((single.parallel_fraction() - 1.0).abs() < 0.001);
}
/// Tensor-parallel runs: TP=1 has no all-reduce or comm cost; TP>1 must
/// report positive communication, memory, and compute figures.
#[test]
fn test_distributed_bench_tensor_parallel() {
    let config = DistributedBenchConfig::default();
    let mut suite = DistributedBenchSuite::new(config);
    suite.run_tensor_parallel_benchmark();
    let results = suite.tp_results();
    assert!(!results.is_empty());
    let tp1 = results.iter().find(|r| r.tp_degree == 1).unwrap();
    assert!(tp1.all_reduce_ms.abs() < 0.001);
    assert!(tp1.comm_overhead_pct.abs() < 0.001);
    for result in results.iter().filter(|r| r.tp_degree > 1) {
        assert!(result.all_reduce_ms > 0.0);
        assert!(result.comm_overhead_pct > 0.0);
        assert!(result.memory_per_gpu_mb > 0.0);
        assert!(result.effective_tflops > 0.0);
    }
}
/// Pipeline-parallel runs: PP=1 has no bubble or inter-stage cost;
/// PP>1 reports a bubble ratio strictly between 0 and 1.
#[test]
fn test_distributed_bench_pipeline_parallel() {
    let config = DistributedBenchConfig::default();
    let mut suite = DistributedBenchSuite::new(config);
    suite.run_pipeline_parallel_benchmark();
    let results = suite.pp_results();
    assert!(!results.is_empty());
    let pp1 = results.iter().find(|r| r.pp_degree == 1).unwrap();
    assert!(pp1.bubble_ratio.abs() < 0.001);
    assert!(pp1.inter_stage_ms.abs() < 0.001);
    for result in results.iter().filter(|r| r.pp_degree > 1) {
        assert!(result.bubble_ratio > 0.0);
        assert!(result.bubble_ratio < 1.0);
        assert!(result.inter_stage_ms > 0.0);
        assert!(result.micro_batches > 0);
        assert!(result.throughput_tps > 0.0);
        assert!(result.memory_per_stage_mb > 0.0);
    }
}
/// Communication microbenchmarks: 8 result rows with all fields
/// populated, and all_gather beating all_reduce at the 1 KiB payload.
#[test]
fn test_distributed_bench_communication() {
    let config = DistributedBenchConfig::default();
    let mut suite = DistributedBenchSuite::new(config);
    suite.run_communication_benchmark();
    let results = suite.comm_results();
    assert_eq!(results.len(), 8);
    for result in results {
        assert!(result.latency_us > 0.0);
        assert!(result.bandwidth_gbps > 0.0);
        assert!(result.world_size > 0);
        assert!(!result.operation.is_empty());
        assert!(result.data_size_bytes > 0);
    }
    let reduce_1kb = results
        .iter()
        .find(|r| r.operation == "all_reduce" && r.data_size_bytes == 1024)
        .unwrap();
    let gather_1kb = results
        .iter()
        .find(|r| r.operation == "all_gather" && r.data_size_bytes == 1024)
        .unwrap();
    // all_gather is modeled as cheaper than all_reduce at equal size.
    assert!(gather_1kb.latency_us < reduce_1kb.latency_us);
}
/// `run_all` populates every result category in one pass.
#[test]
fn test_distributed_bench_run_all() {
    let config = DistributedBenchConfig::for_small_model();
    let mut suite = DistributedBenchSuite::new(config);
    suite.run_all();
    assert!(!suite.scaling_results().is_empty());
    assert!(!suite.tp_results().is_empty());
    assert!(!suite.pp_results().is_empty());
    assert!(!suite.comm_results().is_empty());
}
/// Summary after a full run: 8-GPU max scaling, internally consistent
/// efficiency bounds, and non-negative comm/bubble averages.
#[test]
fn test_distributed_bench_summary() {
    let config = DistributedBenchConfig::default();
    let mut suite = DistributedBenchSuite::new(config);
    suite.run_all();
    let summary = suite.summary();
    assert_eq!(summary.max_scaling, 8);
    assert!(summary.max_efficiency > 0.0);
    assert!(summary.min_efficiency > 0.0);
    assert!(summary.max_efficiency >= summary.min_efficiency);
    assert!(summary.max_throughput_tps > 0.0);
    assert!(summary.avg_tp_comm_overhead_pct >= 0.0);
    assert!(summary.avg_pp_bubble_ratio >= 0.0);
}
/// With the small-model config every scaling result clears the
/// configured efficiency threshold.
#[test]
fn test_distributed_bench_all_meet_threshold() {
    let config = DistributedBenchConfig::for_small_model();
    let mut suite = DistributedBenchSuite::new(config);
    suite.run_scaling_benchmark();
    assert!(suite.all_meet_efficiency_threshold());
}
/// An unrealistically strict threshold (99%) must cause at least one
/// scaling result to fail the efficiency check.
#[test]
fn test_distributed_bench_fail_threshold() {
    let config = DistributedBenchConfig {
        efficiency_threshold: 0.99,
        ..DistributedBenchConfig::default()
    };
    let mut suite = DistributedBenchSuite::new(config);
    suite.run_scaling_benchmark();
    assert!(!suite.all_meet_efficiency_threshold());
}
/// Summarizing a suite that never ran reports neutral defaults rather
/// than panicking.
#[test]
fn test_distributed_bench_empty_summary() {
    let config = DistributedBenchConfig::default();
    let suite = DistributedBenchSuite::new(config);
    let summary = suite.summary();
    assert_eq!(summary.max_scaling, 1);
    assert!((summary.max_efficiency - 0.0).abs() < 0.001);
    assert!((summary.avg_tp_comm_overhead_pct - 0.0).abs() < 0.001);
    assert!((summary.avg_pp_bubble_ratio - 0.0).abs() < 0.001);
}
/// Pins the documented `LoadTestConfig` default values.
#[test]
fn test_load_test_config_default() {
    let config = LoadTestConfig::default();
    assert_eq!(config.concurrency, 10);
    assert_eq!(config.duration_secs, 60);
    assert!((config.target_rps - 0.0).abs() < 0.001);
    assert_eq!(config.timeout_ms, 5000);
    assert_eq!(config.warmup_secs, 5);
    assert!((config.latency_threshold_ms - 500.0).abs() < 0.001);
}
/// Stress-test preset: high concurrency, long duration, relaxed latency.
#[test]
fn test_load_test_config_stress_test() {
    let config = LoadTestConfig::for_stress_test();
    assert_eq!(config.concurrency, 100);
    assert_eq!(config.duration_secs, 300);
    assert!((config.latency_threshold_ms - 1000.0).abs() < 0.001);
}
/// Latency-test preset: single client, paced RPS, tight latency budget.
#[test]
fn test_load_test_config_latency_test() {
    let config = LoadTestConfig::for_latency_test();
    assert_eq!(config.concurrency, 1);
    assert!((config.target_rps - 10.0).abs() < 0.001);
    assert!((config.latency_threshold_ms - 200.0).abs() < 0.001);
}
/// The default config validates; zeroing out concurrency invalidates it.
#[test]
fn test_load_test_config_validation() {
    assert!(LoadTestConfig::default().is_valid());
    let zero_concurrency = LoadTestConfig {
        concurrency: 0,
        ..LoadTestConfig::default()
    };
    assert!(!zero_concurrency.is_valid());
}
/// A simulated run produces internally consistent counters and a
/// strictly increasing latency percentile ladder (p50 < p95 < p99 < max).
#[test]
fn test_load_test_runner_simulate() {
    let config = LoadTestConfig::default();
    let runner = LoadTestRunner::new(config);
    let result = runner.simulate_run();
    assert!(result.total_requests > 0);
    assert!(result.successful_requests > 0);
    assert!(result.rps_achieved > 0.0);
    assert!(result.latency_p50_ms > 0.0);
    assert!(result.latency_p95_ms > result.latency_p50_ms);
    assert!(result.latency_p99_ms > result.latency_p95_ms);
    assert!(result.latency_max_ms > result.latency_p99_ms);
    assert!(result.data_transferred_bytes > 0);
    assert!(result.duration_secs > 0.0);
    assert!(result.error_rate >= 0.0 && result.error_rate < 1.0);
}
/// `is_passing` requires both a low error rate and a met latency
/// threshold; violating either criterion alone fails the run.
#[test]
fn test_load_test_result_is_passing() {
    let passing = LoadTestResult {
        total_requests: 1000,
        successful_requests: 995,
        failed_requests: 5,
        rps_achieved: 100.0,
        latency_p50_ms: 20.0,
        latency_p95_ms: 50.0,
        latency_p99_ms: 80.0,
        latency_max_ms: 200.0,
        data_transferred_bytes: 1_000_000,
        duration_secs: 10.0,
        error_rate: 0.005,
        passed_latency_threshold: true,
    };
    assert!(passing.is_passing());
    // 5% errors is above the acceptable error rate.
    let failing_error_rate = LoadTestResult {
        error_rate: 0.05,
        ..passing.clone()
    };
    assert!(!failing_error_rate.is_passing());
    // A missed latency threshold alone is also a failure.
    let failing_latency = LoadTestResult {
        passed_latency_threshold: false,
        ..passing
    };
    assert!(!failing_latency.is_passing());
}
/// 10,000,000 bytes transferred over 10 s reports 1.0 from
/// `throughput_mbps`.
#[test]
fn test_load_test_result_throughput() {
    let result = LoadTestResult {
        total_requests: 1000,
        successful_requests: 1000,
        failed_requests: 0,
        rps_achieved: 100.0,
        latency_p50_ms: 20.0,
        latency_p95_ms: 50.0,
        latency_p99_ms: 80.0,
        latency_max_ms: 200.0,
        data_transferred_bytes: 10_000_000,
        duration_secs: 10.0,
        error_rate: 0.0,
        passed_latency_threshold: true,
    };
    // 10_000_000 bytes / 10 s => 1.0 (MB/s).
    assert!((result.throughput_mbps() - 1.0).abs() < 0.001);
}
/// Extracts total ms and token count from a llama.cpp
/// "prompt eval time" perf line.
#[test]
fn test_parse_llama_cli_timing_prompt_eval() {
    let output = r"llama_perf_context_print: prompt eval time = 12.34 ms / 10 tokens ( 1.23 ms per token, 810.37 tokens per second)";
    let timing = LlamaCppBackend::parse_timing_line(output, "prompt eval time");
    assert!(timing.is_some());
    let (total_ms, tokens) = timing.unwrap();
    assert!((total_ms - 12.34).abs() < 0.01);
    assert_eq!(tokens, 10);
}
/// Extracts total ms and run count from a llama.cpp "eval time" line.
#[test]
fn test_parse_llama_cli_timing_eval() {
    let output = r"llama_perf_context_print: eval time = 22.60 ms / 5 runs ( 4.52 ms per token, 221.28 tokens per second)";
    let timing = LlamaCppBackend::parse_timing_line(output, "eval time");
    assert!(timing.is_some());
    let (total_ms, runs) = timing.unwrap();
    assert!((total_ms - 22.60).abs() < 0.01);
    assert_eq!(runs, 5);
}
/// Extracts total ms and token count from a llama.cpp "total time" line
/// (which has no per-token parenthetical).
#[test]
fn test_parse_llama_cli_timing_total() {
    let output = r"llama_perf_context_print: total time = 23.27 ms / 6 tokens";
    let timing = LlamaCppBackend::parse_timing_line(output, "total time");
    assert!(timing.is_some());
    let (total_ms, tokens) = timing.unwrap();
    assert!((total_ms - 23.27).abs() < 0.01);
    assert_eq!(tokens, 6);
}
/// End-to-end parse of a realistic llama-cli transcript: TTFT comes from
/// the prompt-eval line, total time from the total line, token count
/// from the eval line, and the generated text from the body.
#[test]
fn test_parse_llama_cli_full_output() {
    let output = r#"Hello world",
```
llama_perf_sampler_print: sampling time = 0.14 ms / 6 runs ( 0.02 ms per token, 42553.19 tokens per second)
llama_perf_context_print: load time = 1349.68 ms
llama_perf_context_print: prompt eval time = 5.00 ms / 1 tokens ( 5.00 ms per token, 200.00 tokens per second)
llama_perf_context_print: eval time = 22.60 ms / 5 runs ( 4.52 ms per token, 221.28 tokens per second)
llama_perf_context_print: total time = 27.60 ms / 6 tokens"#;
    let result = LlamaCppBackend::parse_cli_output(output);
    assert!(result.is_ok());
    let response = result.unwrap();
    assert!((response.ttft_ms - 5.0).abs() < 0.1);
    assert!((response.total_time_ms - 27.60).abs() < 0.1);
    assert_eq!(response.tokens_generated, 5);
    assert!(response.text.contains("Hello world"));
}
/// Generated text is everything before the first perf-print line,
/// trimmed of trailing blank lines.
#[test]
fn test_parse_llama_cli_extract_generated_text() {
    let output =
        "The answer is 42.\n\nllama_perf_context_print: total time = 100.0 ms / 10 tokens";
    let text = LlamaCppBackend::extract_generated_text(output);
    assert_eq!(text, "The answer is 42.");
}
/// Builds llama-cli args from config + request and verifies each
/// expected flag and its value lands in the arg vector.
#[test]
fn test_llama_cpp_backend_build_command() {
    let backend = LlamaCppBackend::new(LlamaCppConfig {
        binary_path: "/path/to/llama-cli".to_string(),
        model_path: Some("/path/to/model.gguf".to_string()),
        n_gpu_layers: 99,
        ctx_size: 4096,
        threads: 8,
    });
    let request = InferenceRequest {
        prompt: "Hello".to_string(),
        max_tokens: 10,
        temperature: 0.7,
        stop: vec![],
    };
    let args = backend.build_cli_args(&request);
    // Flag/value pairs flattened: model, prompt, max tokens, GPU layers,
    // context size, and thread count.
    let expected = [
        "-m", "/path/to/model.gguf",
        "-p", "Hello",
        "-n", "10",
        "-ngl", "99",
        "-c", "4096",
        "-t", "8",
    ];
    for token in expected {
        assert!(args.contains(&token.to_string()));
    }
}
/// Inference without a configured model path must fail up front rather
/// than attempting to launch the binary.
#[test]
fn test_llama_cpp_backend_no_model_path_error() {
    let config = LlamaCppConfig {
        binary_path: "/path/to/llama-cli".to_string(),
        model_path: None,
        n_gpu_layers: 0,
        ctx_size: 2048,
        threads: 4,
    };
    let backend = LlamaCppBackend::new(config);
    let request = InferenceRequest {
        prompt: "Hello".to_string(),
        max_tokens: 10,
        temperature: 0.7,
        stop: vec![],
    };
    let result = backend.inference(&request);
    assert!(result.is_err());
}
/// `Display` for each backend variant yields its lowercase name.
#[test]
fn test_compute_backend_type_display() {
    let expected = [
        (ComputeBackendType::Cpu, "cpu"),
        (ComputeBackendType::Wgpu, "wgpu"),
        (ComputeBackendType::Cuda, "cuda"),
    ];
    // `to_string()` routes through the same `Display` impl as `format!`.
    for (backend, name) in expected {
        assert_eq!(backend.to_string(), name);
    }
}
/// `parse` accepts every documented alias (case-insensitive "cpu",
/// "gpu" as wgpu, "nvidia" as cuda) and rejects unknown strings.
#[test]
fn test_compute_backend_type_from_str() {
    let cases = [
        ("cpu", Some(ComputeBackendType::Cpu)),
        ("CPU", Some(ComputeBackendType::Cpu)),
        ("wgpu", Some(ComputeBackendType::Wgpu)),
        ("gpu", Some(ComputeBackendType::Wgpu)),
        ("cuda", Some(ComputeBackendType::Cuda)),
        ("nvidia", Some(ComputeBackendType::Cuda)),
        ("unknown", None),
    ];
    for (input, expected) in cases {
        assert_eq!(ComputeBackendType::parse(input), expected);
    }
}
/// `all()` enumerates exactly the three known backends.
#[test]
fn test_compute_backend_type_all() {
    let all = ComputeBackendType::all();
    assert_eq!(all.len(), 3);
    assert!(all.contains(&ComputeBackendType::Cpu));
    assert!(all.contains(&ComputeBackendType::Wgpu));
    assert!(all.contains(&ComputeBackendType::Cuda));
}
/// An `unavailable` entry keeps its runtime/backend identity and is
/// flagged with a "not available" note.
#[test]
fn test_matrix_benchmark_entry_unavailable() {
    let entry =
        MatrixBenchmarkEntry::unavailable(RuntimeType::Realizar, ComputeBackendType::Cuda);
    assert!(!entry.available);
    assert_eq!(entry.runtime, RuntimeType::Realizar);
    assert_eq!(entry.backend, ComputeBackendType::Cuda);
    assert!(entry.notes.contains("not available"));
}
/// Building an entry from latency/throughput samples records identity,
/// sample count, the latency median, and the cold-start time.
#[test]
fn test_matrix_benchmark_entry_from_samples() {
    let latencies = vec![100.0, 105.0, 110.0, 95.0, 102.0];
    let throughputs = vec![50.0, 48.0, 52.0, 49.0, 51.0];
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Wgpu,
        "phi-2",
        &latencies,
        &throughputs,
        150.0,
    );
    assert!(entry.available);
    assert_eq!(entry.runtime, RuntimeType::LlamaCpp);
    assert_eq!(entry.backend, ComputeBackendType::Wgpu);
    assert_eq!(entry.model, "phi-2");
    assert_eq!(entry.samples, 5);
    // 102.0 is the median of the five latency samples.
    assert!((entry.p50_latency_ms - 102.0).abs() < 1.0);
    assert!((entry.cold_start_ms - 150.0).abs() < 0.1);
    assert!(entry.throughput_tps > 0.0);
}
/// Empty sample sets produce an unavailable entry instead of panicking.
#[test]
fn test_matrix_benchmark_entry_from_empty_samples() {
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "model",
        &[],
        &[],
        0.0,
    );
    assert!(!entry.available);
}
/// `with_notes` replaces the auto-generated note text.
#[test]
fn test_matrix_benchmark_entry_with_notes() {
    let entry =
        MatrixBenchmarkEntry::unavailable(RuntimeType::Realizar, ComputeBackendType::Cuda)
            .with_notes("GPU layers: 99");
    assert_eq!(entry.notes, "GPU layers: 99");
}
/// A fresh matrix records model, version, and methodology, with no
/// entries yet.
#[test]
fn test_benchmark_matrix_creation() {
    let hardware = HardwareSpec::default();
    let matrix = BenchmarkMatrix::new("phi-2", hardware);
    assert_eq!(matrix.model, "phi-2");
    assert_eq!(matrix.version, "1.1");
    assert!(matrix.methodology.contains("Hoefler"));
    assert!(matrix.entries.is_empty());
}
/// `add_entry` appends new (runtime, backend) combinations and replaces
/// an existing combination in place instead of duplicating it.
#[test]
fn test_benchmark_matrix_add_entry() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("phi-2", hardware);
    let entry1 = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[100.0, 102.0, 98.0],
        &[50.0, 51.0, 49.0],
        100.0,
    );
    matrix.add_entry(entry1);
    assert_eq!(matrix.entries.len(), 1);
    let entry2 = MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Wgpu,
        "phi-2",
        &[80.0, 82.0, 78.0],
        &[60.0, 61.0, 59.0],
        120.0,
    );
    matrix.add_entry(entry2);
    assert_eq!(matrix.entries.len(), 2);
    // Same (runtime, backend) pair again: must overwrite, not append.
    let entry1_updated = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[90.0, 92.0, 88.0],
        &[55.0, 56.0, 54.0],
        95.0,
    );
    matrix.add_entry(entry1_updated);
    assert_eq!(matrix.entries.len(), 2);
}
/// `get_entry` returns the stored entry for a (runtime, backend) pair
/// and `None` for combinations never benchmarked.
#[test]
fn test_benchmark_matrix_get_entry() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("phi-2", hardware);
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[100.0],
        &[50.0],
        100.0,
    );
    matrix.add_entry(entry);
    let found = matrix.get_entry(RuntimeType::Realizar, ComputeBackendType::Cpu);
    assert!(found.is_some());
    assert_eq!(found.unwrap().runtime, RuntimeType::Realizar);
    let not_found = matrix.get_entry(RuntimeType::LlamaCpp, ComputeBackendType::Cuda);
    assert!(not_found.is_none());
}
/// `entries_for_runtime` filters the matrix down to one runtime's rows.
#[test]
fn test_benchmark_matrix_entries_for_runtime() {
    let mut matrix = BenchmarkMatrix::new("phi-2", HardwareSpec::default());
    // Small helper so each fixture row is one line.
    let mut add = |runtime, backend, lat: f64, tps: f64, cold: f64| {
        matrix.add_entry(MatrixBenchmarkEntry::from_samples(
            runtime,
            backend,
            "phi-2",
            &[lat],
            &[tps],
            cold,
        ));
    };
    add(RuntimeType::Realizar, ComputeBackendType::Cpu, 100.0, 50.0, 100.0);
    add(RuntimeType::Realizar, ComputeBackendType::Wgpu, 80.0, 60.0, 90.0);
    add(RuntimeType::LlamaCpp, ComputeBackendType::Cpu, 90.0, 55.0, 95.0);
    assert_eq!(matrix.entries_for_runtime(RuntimeType::Realizar).len(), 2);
    assert_eq!(matrix.entries_for_runtime(RuntimeType::LlamaCpp).len(), 1);
}
/// `entries_for_backend` filters by compute backend; backends with no
/// entries come back empty.
#[test]
fn test_benchmark_matrix_entries_for_backend() {
    let mut matrix = BenchmarkMatrix::new("phi-2", HardwareSpec::default());
    // Small helper so each fixture row is one line.
    let mut add = |runtime, backend, lat: f64, tps: f64, cold: f64| {
        matrix.add_entry(MatrixBenchmarkEntry::from_samples(
            runtime,
            backend,
            "phi-2",
            &[lat],
            &[tps],
            cold,
        ));
    };
    add(RuntimeType::Realizar, ComputeBackendType::Cpu, 100.0, 50.0, 100.0);
    add(RuntimeType::LlamaCpp, ComputeBackendType::Cpu, 90.0, 55.0, 95.0);
    add(RuntimeType::Realizar, ComputeBackendType::Wgpu, 80.0, 60.0, 90.0);
    assert_eq!(matrix.entries_for_backend(ComputeBackendType::Cpu).len(), 2);
    assert_eq!(matrix.entries_for_backend(ComputeBackendType::Wgpu).len(), 1);
    assert!(matrix.entries_for_backend(ComputeBackendType::Cuda).is_empty());
}
/// `fastest_for_backend` picks the entry with the lower latency samples.
#[test]
fn test_benchmark_matrix_fastest_for_backend() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("phi-2", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[100.0, 102.0, 98.0],
        &[50.0],
        100.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Cpu,
        "phi-2",
        &[80.0, 82.0, 78.0],
        &[55.0],
        95.0,
    ));
    let fastest = matrix.fastest_for_backend(ComputeBackendType::Cpu);
    assert!(fastest.is_some());
    assert_eq!(fastest.unwrap().runtime, RuntimeType::LlamaCpp);
}
/// `highest_throughput_for_backend` picks the entry with more tokens/s.
#[test]
fn test_benchmark_matrix_highest_throughput_for_backend() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("phi-2", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[100.0],
        &[50.0, 51.0, 49.0],
        100.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Cpu,
        "phi-2",
        &[90.0],
        &[70.0, 71.0, 69.0],
        95.0,
    ));
    let highest = matrix.highest_throughput_for_backend(ComputeBackendType::Cpu);
    assert!(highest.is_some());
    assert_eq!(highest.unwrap().runtime, RuntimeType::LlamaCpp);
}
/// Markdown output has a header row, bold runtime cells for available
/// entries, and dash placeholders for unavailable ones.
#[test]
fn test_benchmark_matrix_to_markdown_table() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("phi-2", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[100.0, 110.0, 105.0],
        &[50.0, 51.0, 49.0],
        100.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::unavailable(
        RuntimeType::Realizar,
        ComputeBackendType::Cuda,
    ));
    let table = matrix.to_markdown_table();
    assert!(table.contains("| Runtime | Backend |"));
    assert!(table.contains("| **realizar** |"));
    assert!(table.contains("| - | - |"));
}
/// The whole matrix serializes to JSON and parses back intact.
#[test]
fn test_benchmark_matrix_json_roundtrip() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("phi-2", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[100.0, 102.0, 98.0],
        &[50.0, 51.0, 49.0],
        100.0,
    ));
    let json = matrix.to_json().expect("serialization should succeed");
    // The `": "` spacing implies pretty-printed JSON output.
    assert!(json.contains("\"model\": \"phi-2\""));
    let parsed = BenchmarkMatrix::from_json(&json).expect("deserialization should succeed");
    assert_eq!(parsed.model, "phi-2");
    assert_eq!(parsed.entries.len(), 1);
}
/// Summary over a mixed matrix: counts available vs total entries and
/// identifies the llamacpp/wgpu entry as both overall fastest and
/// highest throughput.
#[test]
fn test_benchmark_matrix_summary() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("phi-2", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "phi-2",
        &[100.0, 102.0, 98.0],
        &[50.0, 51.0, 49.0],
        100.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Cpu,
        "phi-2",
        &[80.0, 82.0, 78.0],
        &[70.0, 71.0, 69.0],
        95.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Wgpu,
        "phi-2",
        &[60.0, 62.0, 58.0],
        &[80.0, 81.0, 79.0],
        90.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::unavailable(
        RuntimeType::Realizar,
        ComputeBackendType::Cuda,
    ));
    let summary = matrix.summary();
    assert_eq!(summary.total_entries, 4);
    assert_eq!(summary.available_entries, 3);
    assert!(summary.overall_fastest.is_some());
    let (fastest_runtime, fastest_backend) = summary.overall_fastest.unwrap();
    assert_eq!(fastest_runtime, "llamacpp");
    assert_eq!(fastest_backend, "wgpu");
    assert!(summary.overall_highest_throughput.is_some());
    let (tp_runtime, tp_backend) = summary.overall_highest_throughput.unwrap();
    assert_eq!(tp_runtime, "llamacpp");
    assert_eq!(tp_backend, "wgpu");
}
/// Default `MatrixBenchmarkConfig` covers all three runtimes plus the
/// cpu/wgpu backends and uses the documented sampling parameters.
#[test]
fn test_matrix_benchmark_config_default() {
    let config = MatrixBenchmarkConfig::default();
    assert!(config.runtimes.contains(&RuntimeType::Realizar));
    assert!(config.runtimes.contains(&RuntimeType::LlamaCpp));
    assert!(config.runtimes.contains(&RuntimeType::Ollama));
    assert!(config.backends.contains(&ComputeBackendType::Cpu));
    assert!(config.backends.contains(&ComputeBackendType::Wgpu));
    // Fix: `assert_eq!` on an f64 is a direct float-equality comparison
    // (clippy::float_cmp); use the epsilon pattern applied throughout
    // this test module instead.
    assert!((config.cv_threshold - 0.05).abs() < f64::EPSILON);
    assert_eq!(config.min_samples, 30);
    assert_eq!(config.max_samples, 200);
    assert_eq!(config.warmup_iterations, 5);
}
/// QA-031: CV is ~0 for constant samples and clearly positive for a
/// linear ramp, so CV-based stopping decisions are meaningful.
#[test]
fn test_qa_031_benchmark_statistical_validity() {
    let sampler = DynamicSampler::new(10, 100, 0.05);
    let stable_samples: Vec<f64> = (0..50).map(|_| 100.0).collect();
    let cv = sampler.current_cv(&stable_samples);
    assert!(
        cv.abs() < 0.001,
        "QA-031: Stable samples should have near-zero CV, got {}",
        cv
    );
    // 50, 52, 54, ... — a ramp whose std-dev/mean is well above 10%.
    let variable_samples: Vec<f64> = (0..50).map(|i| 50.0 + (i as f64) * 2.0).collect();
    let cv_var = sampler.current_cv(&variable_samples);
    assert!(
        cv_var > 0.1,
        "QA-031: Variable samples should have measurable CV, got {}",
        cv_var
    );
}
/// QA-032: thermal-guard defaults are internally consistent — max temp
/// in a plausible GPU range, cooldown strictly below it, and a small
/// positive variance allowance.
#[test]
fn test_qa_032_thermal_guard_validation() {
    let guard = ThermalGuard::default();
    assert!(
        guard.max_temp_c > 70.0 && guard.max_temp_c <= 95.0,
        "QA-032: Max temp should be in safe GPU range"
    );
    assert!(
        guard.cooldown_threshold_c < guard.max_temp_c,
        "QA-032: Cooldown threshold must be below max temp"
    );
    assert!(
        guard.temp_variance_c > 0.0 && guard.temp_variance_c <= 5.0,
        "QA-032: Temperature variance threshold should be reasonable"
    );
}
/// QA-033: inter-token-latency metrics preserve percentile ordering
/// (median <= p99 <= p999) and report a non-negative spread.
#[test]
fn test_qa_033_itl_variance_capture() {
    let samples = vec![10.0, 12.0, 11.0, 13.0, 10.0, 15.0, 11.0, 12.0];
    let metrics = ItlMetrics::from_measurements(&samples);
    assert!(
        metrics.p999_ms >= metrics.p99_ms,
        "QA-033: p999 should be >= p99"
    );
    assert!(
        metrics.p99_ms >= metrics.median_ms,
        "QA-033: p99 should be >= median"
    );
    assert!(metrics.median_ms > 0.0, "QA-033: Median should be positive");
    assert!(
        metrics.std_dev_ms >= 0.0,
        "QA-033: Std dev should be non-negative"
    );
}
/// QA-034: with near-constant samples the CV-based stopper reports
/// convergence well before the iteration cap.
#[test]
#[allow(clippy::similar_names)]
fn test_qa_034_cv_stopping_convergence() {
    let mut sampler = DynamicSampler::new(10, 1000, 0.05);
    sampler.stability_count = 3;
    let mut samples = Vec::new();
    let mut stopped = false;
    for i in 0..100 {
        // Values cycle through {100, 101, 102}: tiny CV, so
        // `should_continue` should flip to false early.
        samples.push(100.0 + (i as f64 % 3.0));
        if !sampler.should_continue(&samples) {
            stopped = true;
            break;
        }
    }
    assert!(
        stopped,
        "QA-034: CV-based stopping should converge for stable samples"
    );
}
/// QA-035: a sampled entry round-trips through serde_json.
#[test]
fn test_qa_035_benchmark_serialization() {
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test-model",
        &[50.0, 52.0, 48.0],
        &[100.0, 98.0, 102.0],
        95.0,
    );
    let json = serde_json::to_string(&entry);
    assert!(
        json.is_ok(),
        "QA-035: MatrixBenchmarkEntry should serialize"
    );
    let deser: Result<MatrixBenchmarkEntry, _> = serde_json::from_str(&json.unwrap());
    assert!(
        deser.is_ok(),
        "QA-035: MatrixBenchmarkEntry should deserialize"
    );
}
/// QA-036: every runtime and backend variant yields a non-empty name.
#[test]
fn test_qa_036_runtime_backend_completeness() {
    let runtimes = [
        RuntimeType::Realizar,
        RuntimeType::LlamaCpp,
        RuntimeType::Ollama,
        RuntimeType::Vllm,
    ];
    for runtime in &runtimes {
        let name = runtime.as_str();
        assert!(
            !name.is_empty(),
            "QA-036: Runtime {} should have a name",
            name
        );
    }
    let backends = [
        ComputeBackendType::Cpu,
        ComputeBackendType::Cuda,
        ComputeBackendType::Wgpu,
    ];
    for backend in &backends {
        let name = backend.to_string();
        assert!(
            !name.is_empty(),
            "QA-036: Backend {:?} should have a name",
            backend
        );
    }
}
/// QA-037: summary counts are exact and the fastest runtime is the one
/// with the lower latency sample (50 ms vs 100 ms).
#[test]
fn test_qa_037_matrix_summary_correctness() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("test-model", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test",
        &[100.0],
        &[10.0],
        90.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Cpu,
        "test",
        &[50.0],
        &[20.0],
        95.0,
    ));
    let summary = matrix.summary();
    assert_eq!(summary.total_entries, 2, "QA-037: Should have 2 entries");
    assert_eq!(
        summary.available_entries, 2,
        "QA-037: Both entries should be available"
    );
    if let Some((fastest, _)) = &summary.overall_fastest {
        assert_eq!(fastest, "llamacpp", "QA-037: LlamaCpp should be fastest");
    }
}
/// QA-038: the markdown report names the runtime it benchmarked.
#[test]
fn test_qa_038_report_generation() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("test-model", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test-model",
        &[100.0],
        &[50.0],
        90.0,
    ));
    let report = matrix.to_markdown_table();
    assert!(
        report.contains("realizar") || report.contains("Realizar"),
        "QA-038: Report should mention realizar"
    );
}
/// QA-039: the sampler always continues below `min_samples` and always
/// stops at or beyond `max_samples`, regardless of CV.
#[test]
fn test_qa_039_sampler_bounds() {
    let mut sampler = DynamicSampler::new(5, 20, 0.01);
    let few_samples = vec![1.0, 2.0, 3.0];
    assert!(
        sampler.should_continue(&few_samples),
        "QA-039: Should continue below min_samples"
    );
    // 25 samples exceeds max_samples (20), forcing a stop.
    let many_samples: Vec<f64> = (0..25).map(|i| i as f64).collect();
    assert!(
        !sampler.should_continue(&many_samples),
        "QA-039: Should stop at max_samples"
    );
}
/// QA-040: `ItlMetrics` edge cases — a single sample is its own median,
/// empty input must not panic, and identical samples have zero spread.
#[test]
fn test_qa_040_itl_edge_cases() {
    let single = ItlMetrics::from_measurements(&[100.0]);
    assert!(
        (single.median_ms - 100.0).abs() < 0.001,
        "QA-040: Single sample median should equal the sample"
    );
    // Fix: `empty.median_ms == 0.0` was a direct float-equality
    // comparison (clippy::float_cmp); use an epsilon comparison like
    // the rest of this module.
    let empty = ItlMetrics::from_measurements(&[]);
    assert!(
        empty.median_ms.is_nan() || empty.median_ms.abs() < f64::EPSILON,
        "QA-040: Empty samples should produce NaN or 0"
    );
    let same = ItlMetrics::from_measurements(&[50.0, 50.0, 50.0, 50.0]);
    assert!(
        same.std_dev_ms.abs() < 0.001,
        "QA-040: Identical samples should have zero std_dev"
    );
}
/// QA-041: runtimes are nameable and a fresh matrix starts empty.
#[test]
fn test_qa_041_benchmark_infrastructure() {
    let runtimes = [
        RuntimeType::Realizar,
        RuntimeType::Ollama,
        RuntimeType::LlamaCpp,
    ];
    for runtime in &runtimes {
        assert!(
            !runtime.as_str().is_empty(),
            "QA-041: Runtime {} should have a name",
            runtime.as_str()
        );
    }
    let hardware = HardwareSpec::default();
    let matrix = BenchmarkMatrix::new("test-model", hardware);
    assert!(
        matrix.entries.is_empty(),
        "QA-041: New matrix should be empty"
    );
}
/// QA-042: a two-runtime comparison renders both into the report.
#[test]
fn test_qa_042_comparison_report() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("test-model", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test",
        &[100.0, 105.0, 95.0],
        &[50.0, 55.0, 45.0],
        90.0,
    ));
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::LlamaCpp,
        ComputeBackendType::Cpu,
        "test",
        &[80.0, 85.0, 75.0],
        &[40.0, 45.0, 35.0],
        110.0,
    ));
    let report = matrix.to_markdown_table();
    assert!(
        report.contains("realizar") || report.contains("Realizar"),
        "QA-042: Report should include Realizar"
    );
}
/// QA-043: the CPU backend is identifiable by name and preserved on
/// entries built against it.
#[test]
fn test_qa_043_cpu_benchmarks() {
    let cpu_backend = ComputeBackendType::Cpu;
    let backend_str = cpu_backend.to_string();
    assert!(
        backend_str.to_lowercase().contains("cpu"),
        "QA-043: CPU backend should be identifiable"
    );
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test-model",
        &[100.0],
        &[50.0],
        90.0,
    );
    assert_eq!(
        entry.backend,
        ComputeBackendType::Cpu,
        "QA-043: Entry should be CPU backend"
    );
}
/// QA-044: the WGPU backend has a name and round-trips through entry
/// construction (so wgpu rows can be reported even if skipped).
#[test]
fn test_qa_044_wgpu_graceful_skip() {
    let wgpu_backend = ComputeBackendType::Wgpu;
    let backend_str = wgpu_backend.to_string();
    assert!(
        !backend_str.is_empty(),
        "QA-044: WGPU backend should have a name"
    );
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Wgpu,
        "test-model",
        &[100.0],
        &[50.0],
        90.0,
    );
    assert_eq!(
        entry.backend,
        ComputeBackendType::Wgpu,
        "QA-044: Entry should be WGPU backend"
    );
}
/// QA-045: one entry per runtime on the same backend; the summary still
/// selects a fastest candidate.
#[test]
fn test_qa_045_multi_runtime_comparison() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("test-model", hardware);
    for runtime in [
        RuntimeType::Realizar,
        RuntimeType::Ollama,
        RuntimeType::LlamaCpp,
    ] {
        matrix.add_entry(MatrixBenchmarkEntry::from_samples(
            runtime,
            ComputeBackendType::Cpu,
            "test",
            &[100.0],
            &[50.0],
            90.0,
        ));
    }
    assert_eq!(
        matrix.entries.len(),
        3,
        "QA-045: Should have 3 runtime entries"
    );
    let summary = matrix.summary();
    assert!(
        summary.overall_fastest.is_some(),
        "QA-045: Summary should identify fastest runtime"
    );
}
/// QA-046: GGUF and APR variants of the same model each produce a
/// non-empty markdown report.
#[test]
fn test_qa_046_format_comparison() {
    let hardware = HardwareSpec::default();
    let mut gguf_matrix = BenchmarkMatrix::new("model.gguf", hardware.clone());
    let mut apr_matrix = BenchmarkMatrix::new("model.apr", hardware);
    gguf_matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "model.gguf",
        &[100.0],
        &[50.0],
        90.0,
    ));
    apr_matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "model.apr",
        &[95.0],
        &[48.0],
        92.0,
    ));
    let gguf_report = gguf_matrix.to_markdown_table();
    let apr_report = apr_matrix.to_markdown_table();
    assert!(
        !gguf_report.is_empty(),
        "QA-046: GGUF report should be non-empty"
    );
    assert!(
        !apr_report.is_empty(),
        "QA-046: APR report should be non-empty"
    );
}
/// QA-047: entries round-trip through JSON for CI artifact exchange.
#[test]
fn test_qa_047_ci_integration() {
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test-model",
        &[100.0, 105.0],
        &[50.0, 55.0],
        90.0,
    );
    let json = serde_json::to_string(&entry);
    assert!(json.is_ok(), "QA-047: Entry should serialize for CI");
    let deser: Result<MatrixBenchmarkEntry, _> = serde_json::from_str(&json.unwrap());
    assert!(deser.is_ok(), "QA-047: Entry should deserialize from CI");
}
/// QA-048: dashboard-facing fields (p50, p99, throughput, runtime name)
/// are all populated for a sampled entry.
#[test]
fn test_qa_048_metrics_dashboard() {
    let entry = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test-model",
        &[100.0, 105.0, 95.0, 98.0, 102.0],
        &[50.0, 55.0, 45.0, 48.0, 52.0],
        90.0,
    );
    assert!(
        entry.p50_latency_ms > 0.0,
        "QA-048: p50 should be available"
    );
    assert!(
        entry.p99_latency_ms > 0.0,
        "QA-048: p99 should be available"
    );
    assert!(
        entry.throughput_tps > 0.0,
        "QA-048: Throughput should be available"
    );
    assert!(
        !entry.runtime.as_str().is_empty(),
        "QA-048: Runtime should be identifiable"
    );
}
/// QA-049: a 20% p50 latency increase against a flat baseline computes
/// to a regression above the 15% alert threshold.
#[test]
fn test_qa_049_trend_detection() {
    let baseline = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test-model",
        &[100.0, 100.0, 100.0],
        &[50.0, 50.0, 50.0],
        100.0,
    );
    let regressed = MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test-model",
        &[120.0, 120.0, 120.0],
        &[60.0, 60.0, 60.0],
        83.0,
    );
    // (120 - 100) / 100 * 100 = 20% latency regression.
    let regression_percent =
        (regressed.p50_latency_ms - baseline.p50_latency_ms) / baseline.p50_latency_ms * 100.0;
    assert!(
        regression_percent > 15.0,
        "QA-049: Should detect >15% regression, got {}%",
        regression_percent
    );
}
/// QA-050: documentation output is a pipe-delimited markdown table
/// with a header row.
#[test]
fn test_qa_050_documentation_support() {
    let hardware = HardwareSpec::default();
    let mut matrix = BenchmarkMatrix::new("test-model", hardware);
    matrix.add_entry(MatrixBenchmarkEntry::from_samples(
        RuntimeType::Realizar,
        ComputeBackendType::Cpu,
        "test",
        &[100.0],
        &[50.0],
        90.0,
    ));
    let markdown = matrix.to_markdown_table();
    assert!(
        markdown.contains("|"),
        "QA-050: Should produce markdown table"
    );
    assert!(
        markdown.contains("Runtime") || markdown.contains("runtime"),
        "QA-050: Table should have headers"
    );
}
/// IMP-800A: each builder method stores its value on the config.
#[test]
fn test_imp800a_gpu_parity_benchmark_config() {
    let config = GpuParityBenchmark::new("/path/to/model.gguf")
        .with_prompt("Hello world")
        .with_max_tokens(64)
        .with_ollama_endpoint("http://localhost:11434")
        .with_warmup(5)
        .with_iterations(20);
    assert_eq!(config.model_path, "/path/to/model.gguf");
    assert_eq!(config.prompt, "Hello world");
    assert_eq!(config.max_tokens, 64);
    assert_eq!(config.ollama_endpoint, "http://localhost:11434");
    assert_eq!(config.warmup_iterations, 5);
    assert_eq!(config.measurement_iterations, 20);
}
/// IMP-800A: default benchmark settings — empty model path, canonical
/// prompt, 3 warmups, 10 iterations, 5% target CV.
#[test]
fn test_imp800a_gpu_parity_benchmark_default() {
    let config = GpuParityBenchmark::default();
    assert!(config.model_path.is_empty());
    assert_eq!(config.prompt, "The quick brown fox");
    assert_eq!(config.max_tokens, 32);
    assert_eq!(config.warmup_iterations, 3);
    assert_eq!(config.measurement_iterations, 10);
    assert!((config.target_cv - 0.05).abs() < f64::EPSILON);
}
/// IMP-800B: the result constructor stores its inputs and derives the
/// gap ratio (240 / 150 = 1.6).
#[test]
fn test_imp800b_gpu_parity_result_struct() {
    let result = GpuParityResult::new(150.0, 240.0, 0.03, "NVIDIA RTX 4090", 8192);
    assert!((result.realizar_gpu_tps - 150.0).abs() < f64::EPSILON);
    assert!((result.ollama_tps - 240.0).abs() < f64::EPSILON);
    assert!((result.gap_ratio - 1.6).abs() < 0.01);
    assert!((result.cv - 0.03).abs() < f64::EPSILON);
    assert_eq!(result.gpu_device, "NVIDIA RTX 4090");
    assert_eq!(result.vram_mb, 8192);
}
/// IMP-800B: M2/M4 parity gates across three throughput levels — 130
/// tps clears M2 only, 200 tps clears both, 50 tps clears neither.
#[test]
fn test_imp800b_parity_thresholds() {
    // (realizar tps, cv, expect M2 parity, expect M4 parity)
    let cases = [
        (130.0, 0.03, true, false),
        (200.0, 0.02, true, true),
        (50.0, 0.05, false, false),
    ];
    for (tps, cv, expect_m2, expect_m4) in cases {
        let result = GpuParityResult::new(tps, 240.0, cv, "GPU", 8192);
        assert_eq!(result.achieves_m2_parity(), expect_m2);
        assert_eq!(result.achieves_m4_parity(), expect_m4);
    }
}
#[test]
fn test_imp800b_cv_stability() {
    // IMP-800B: measurement stability is decided by the coefficient of
    // variation — 0.04 counts as stable, 0.08 does not.
    let stable = GpuParityResult::new(150.0, 240.0, 0.04, "GPU", 8192);
    let unstable = GpuParityResult::new(150.0, 240.0, 0.08, "GPU", 8192);
    assert!(stable.measurements_stable());
    assert!(!unstable.measurements_stable());
}
#[test]
fn test_imp800c_gap_analysis_struct() {
    // IMP-800C: `with_statistics` attaches p-value and 95% CI bounds to the
    // claimed/measured gap pair.
    let analysis = GapAnalysis::new(2.0, 1.8).with_statistics(0.01, 1.5, 2.1);
    let checks = [
        (analysis.claimed_gap, 2.0),
        (analysis.measured_gap, 1.8),
        (analysis.p_value, 0.01),
        (analysis.ci_95_lower, 1.5),
        (analysis.ci_95_upper, 2.1),
    ];
    for (actual, expected) in checks {
        assert!((actual - expected).abs() < f64::EPSILON);
    }
}
#[test]
fn test_imp800c_claim_verification() {
    // IMP-800C: a claim is verified only when the measured gap falls inside
    // the 95% confidence interval.
    let within_ci = GapAnalysis::new(2.0, 1.8).with_statistics(0.01, 1.5, 2.1);
    assert!(within_ci.claim_verified());
    // 1.2 sits below the lower CI bound of 1.5, so verification must fail.
    let outside_ci = GapAnalysis::new(2.0, 1.2).with_statistics(0.01, 1.5, 2.1);
    assert!(!outside_ci.claim_verified());
}
#[test]
fn test_imp800c_statistical_bounds() {
    // IMP-800C: statistical fields round-trip through `with_statistics`.
    let analysis = GapAnalysis::new(2.0, 1.8).with_statistics(0.05, 1.6, 2.0);
    assert!((analysis.p_value - 0.05).abs() < f64::EPSILON);
    assert!((analysis.ci_95_lower - 1.6).abs() < f64::EPSILON);
    assert!((analysis.ci_95_upper - 2.0).abs() < f64::EPSILON);
}
#[test]
fn test_imp800c_popper_score() {
    // IMP-800C: `with_default_claims` installs four falsifiable claims; at
    // this input, three of four verify, giving a Popper score of 75%.
    let analysis = GapAnalysis::new(2.0, 1.6).with_default_claims(150.0);
    assert_eq!(analysis.claims.len(), 4);
    let score_delta = (analysis.popper_score - 75.0).abs();
    assert!(score_delta < f64::EPSILON);
}
#[test]
fn test_imp800d_falsifiable_claim() {
    // IMP-800D: `evaluate` records the measurement and sets `verified`
    // according to the claim's threshold.
    let claim = FalsifiableClaim::new("TEST-001", "Test claim", 5.0, 25.0).evaluate(30.0);
    assert_eq!(claim.id, "TEST-001");
    assert_eq!(claim.description, "Test claim");
    assert!((claim.expected - 5.0).abs() < f64::EPSILON);
    assert!((claim.threshold - 25.0).abs() < f64::EPSILON);
    assert!((claim.measured - 30.0).abs() < f64::EPSILON);
    // 30.0 clears the 25.0 threshold.
    assert!(claim.verified);
    // Same measurement against a 50.0 threshold must not verify.
    let failed_claim =
        FalsifiableClaim::new("TEST-002", "Failing claim", 5.0, 50.0).evaluate(30.0);
    assert!(!failed_claim.verified);
}
#[test]
fn test_imp800d_gpu_faster_than_cpu() {
    // IMP-800D: GPU-vs-CPU comparison. At 30 tps the GPU path wins and the
    // reported speedup is 6x; at 4 tps it loses.
    let faster = GpuParityResult::new(30.0, 240.0, 0.03, "GPU", 8192);
    assert!(faster.gpu_faster_than_cpu());
    let speedup_delta = (faster.cpu_speedup() - 6.0).abs();
    assert!(speedup_delta < f64::EPSILON);
    let slower = GpuParityResult::new(4.0, 240.0, 0.03, "GPU", 8192);
    assert!(!slower.gpu_faster_than_cpu());
}
#[test]
fn test_imp900a_optimized_gemm_config_default() {
    // IMP-900A: default GEMM tuning parameters.
    let cfg = OptimizedGemmConfig::default();
    assert_eq!(cfg.tile_size, 32);
    assert_eq!(cfg.reg_block, 4);
    assert_eq!(cfg.vector_width, 4);
    assert_eq!(cfg.k_unroll, 4);
    // Tensor cores off by default; double buffering on.
    assert!(!cfg.use_tensor_cores);
    assert!(cfg.double_buffer);
}
#[test]
fn test_imp900a_shared_memory_calculation() {
    // IMP-900A: shared-memory footprint doubles when double buffering is on.
    let double_buffered = OptimizedGemmConfig::default();
    assert_eq!(double_buffered.shared_memory_bytes(), 32 * 32 * 4 * 4);
    // Same config with double buffering disabled halves the footprint.
    let single_buffered = OptimizedGemmConfig {
        double_buffer: false,
        ..Default::default()
    };
    assert_eq!(single_buffered.shared_memory_bytes(), 32 * 32 * 4 * 2);
}
#[test]
fn test_imp900a_threads_per_block() {
    // IMP-900A: both the default and the `large` presets launch 64 threads
    // per block.
    for cfg in [OptimizedGemmConfig::default(), OptimizedGemmConfig::large()] {
        assert_eq!(cfg.threads_per_block(), 64);
    }
}
#[test]
fn test_imp900a_registers_per_thread() {
let config = OptimizedGemmConfig::default();
assert_eq!(config.registers_per_thread(), 16);
let large = OptimizedGemmConfig::large();
assert_eq!(large.registers_per_thread(), 64);
}
#[test]
fn test_imp900a_gemm_performance_result() {
    // IMP-900A: a 1024^3 GEMM in 1.54 ms works out to roughly 1394.5 GFLOPS.
    let result = GemmPerformanceResult::new(1024, 1024, 1024, 1.54);
    let gflops_delta = (result.gflops - 1394.5).abs();
    assert!(gflops_delta < 10.0);
    // Against an 82 TFLOPS peak, efficiency stays under 2%.
    let with_peak = result.with_peak(82000.0);
    assert!(with_peak.efficiency < 2.0);
}
#[test]
fn test_imp900a_performance_improvement_check() {
    // IMP-900A: `improved_by` compares against a baseline scaled by a factor.
    // A 0.70 ms run clears a 2x bar over 1396 GFLOPS but not a 3x bar.
    let baseline_gflops = 1396.0;
    let result = GemmPerformanceResult::new(1024, 1024, 1024, 0.70);
    assert!(result.improved_by(baseline_gflops, 2.0));
    assert!(!result.improved_by(baseline_gflops, 3.0));
}
#[test]
fn test_imp900a_expected_improvement() {
    // IMP-900A: the default benchmark predicts roughly a 4.68x improvement.
    let benchmark = OptimizedGemmBenchmark::default();
    let delta = (benchmark.expected_improvement() - 4.68).abs();
    assert!(delta < 0.1);
}
#[test]
fn test_imp900b_fused_op_spec() {
    // IMP-900B: fusing GEMM + bias + activation collapses 3 kernel launches
    // into 1, a 3x launch reduction that meets the target.
    let spec = FusedOpSpec {
        op_type: FusedOpType::GemmBiasActivation,
        input_dims: vec![256, 2560],
        output_dims: vec![256, 10240],
        activation: Some("gelu".to_string()),
        fused_launches: 1,
        unfused_launches: 3,
    };
    let reduction = spec.launch_reduction();
    assert!((reduction - 3.0).abs() < f64::EPSILON);
    assert!(spec.achieves_target_reduction());
}
#[test]
fn test_imp900b_launch_reduction_targets() {
    // IMP-900B: target-reduction predicate across three fusion scenarios.
    // Fused attention: 4 -> 1 launches, comfortably over target.
    let attention = FusedOpSpec {
        op_type: FusedOpType::FusedAttention,
        input_dims: vec![1, 32, 512, 80],
        output_dims: vec![1, 32, 512, 80],
        activation: None,
        fused_launches: 1,
        unfused_launches: 4,
    };
    assert!(attention.achieves_target_reduction());
    // LayerNorm + linear: 2 -> 1 launches, just enough.
    let layernorm_linear = FusedOpSpec {
        op_type: FusedOpType::LayerNormLinear,
        input_dims: vec![256, 2560],
        output_dims: vec![256, 2560],
        activation: None,
        fused_launches: 1,
        unfused_launches: 2,
    };
    assert!(layernorm_linear.achieves_target_reduction());
    // FFN: 3 -> 2 launches, below target.
    let ffn = FusedOpSpec {
        op_type: FusedOpType::FusedFfn,
        input_dims: vec![256, 2560],
        output_dims: vec![256, 2560],
        activation: None,
        fused_launches: 2,
        unfused_launches: 3,
    };
    assert!(!ffn.achieves_target_reduction());
}
#[test]
fn test_imp900c_flash_attention_phi2_config() {
    // IMP-900C: the phi-2 preset — 32 heads of dim 80, causal masking, and
    // a softmax scale near 1/sqrt(80) ~= 0.1118.
    let cfg = FlashAttentionConfig::phi2();
    assert_eq!(cfg.head_dim, 80);
    assert_eq!(cfg.num_heads, 32);
    assert!(cfg.causal);
    let scale_delta = (cfg.scale - 0.1118).abs();
    assert!(scale_delta < 0.001);
}
#[test]
fn test_imp900c_memory_comparison() {
    // IMP-900C: naive attention memory grows as seq_len^2 while the flash
    // variant stays at a fixed tile footprint regardless of sequence length.
    let cfg = FlashAttentionConfig::phi2();
    let flash_tile_bytes = 64 * 64 * 4 * 2;
    for seq_len in [512usize, 2048] {
        let (naive, flash) = cfg.memory_comparison(seq_len);
        assert_eq!(naive, seq_len * seq_len * 4);
        assert_eq!(flash, flash_tile_bytes);
    }
}
#[test]
fn test_imp900c_memory_savings() {
    // IMP-900C: savings factor scales with sequence length — ~32x at 512
    // tokens, ~512x at 2048 tokens.
    let cfg = FlashAttentionConfig::phi2();
    let savings_512 = cfg.memory_savings(512);
    let savings_2048 = cfg.memory_savings(2048);
    assert!((savings_512 - 32.0).abs() < 1.0);
    assert!((savings_2048 - 512.0).abs() < 10.0);
}
#[test]
fn test_imp900d_memory_pool_default() {
    // IMP-900D: default pool — 256 MiB initial, 2 GiB cap, pinned memory and
    // async transfers enabled, nine size classes.
    let cfg = MemoryPoolConfig::default();
    assert_eq!(cfg.initial_size, 256 * 1024 * 1024);
    assert_eq!(cfg.max_size, 2 * 1024 * 1024 * 1024);
    assert!(cfg.use_pinned_memory);
    assert!(cfg.async_transfers);
    assert_eq!(cfg.size_classes.len(), 9);
}
#[test]
fn test_imp900d_size_class_lookup() {
    // IMP-900D: requests round up to the next size class; anything past the
    // largest class (256 MiB) gets no class at all.
    let cfg = MemoryPoolConfig::default();
    let cases = [
        (1024usize, Some(4096)),
        (500_000, Some(1_048_576)),
        (200_000_000, Some(268_435_456)),
        (500_000_000, None),
    ];
    for (request, expected) in cases {
        assert_eq!(cfg.find_size_class(request), expected);
    }
}
#[test]
fn test_imp900d_bandwidth_improvement() {
    // IMP-900D: pinned memory is expected to buy ~2.4x transfer bandwidth;
    // without it the factor is exactly 1.0.
    let pinned = MemoryPoolConfig::default();
    let pinned_delta = (pinned.expected_bandwidth_improvement() - 2.4).abs();
    assert!(pinned_delta < 0.1);
    let unpinned = MemoryPoolConfig {
        use_pinned_memory: false,
        ..Default::default()
    };
    let unpinned_delta = (unpinned.expected_bandwidth_improvement() - 1.0).abs();
    assert!(unpinned_delta < f64::EPSILON);
}
#[test]
fn test_imp900_combined_result_baseline() {
    // IMP-900: with no optimizations applied, optimized == baseline tps, the
    // gap versus the 240 tps target is ~18.32x, and no milestone is reached.
    let result = Imp900Result::from_baseline(13.1);
    assert!((result.baseline_tps - 13.1).abs() < 0.1);
    assert!((result.optimized_tps - 13.1).abs() < 0.1);
    assert!((result.gap_ratio - 18.32).abs() < 0.1);
    assert!(result.milestone.is_none());
}
#[test]
fn test_imp900_individual_optimizations() {
    // IMP-900: a single 2.5x GEMM improvement scales 13.1 tps to 32.75 tps
    // and narrows the gap to ~7.33x — still short of any milestone.
    let result = Imp900Result::from_baseline(13.1).with_gemm_improvement(2.5);
    assert!((result.optimized_tps - 32.75).abs() < 0.1);
    assert!((result.gap_ratio - 7.33).abs() < 0.1);
    assert!(result.milestone.is_none());
}
#[test]
fn test_imp900_m3_achievement() {
    // IMP-900: GEMM (2.5x) + memory (1.5x) -> 13.1 * 2.5 * 1.5 = 49.125 tps,
    // gap ~4.89x.
    let result = Imp900Result::from_baseline(13.1)
        .with_gemm_improvement(2.5)
        .with_memory_improvement(1.5);
    assert!((result.optimized_tps - 49.125).abs() < 0.1);
    assert!((result.gap_ratio - 4.89).abs() < 0.1);
    // NOTE(review): `achieves_m3()` holds while the recorded milestone label
    // is "M2" — presumably `milestone` reports the highest fully-certified
    // tier rather than the predicate tier; confirm against Imp900Result.
    assert!(result.achieves_m3());
    assert_eq!(result.milestone, Some("M2".to_string()));
}
#[test]
fn test_imp900_m4_achievement() {
    // IMP-900: stacking all four optimizations multiplies throughput to
    // 13.1 * 3.0 * 2.0 * 2.5 * 1.5 tps, pushing the gap below 1x (M4).
    let result = Imp900Result::from_baseline(13.1)
        .with_gemm_improvement(3.0)
        .with_fusion_improvement(2.0)
        .with_flash_attention_improvement(2.5)
        .with_memory_improvement(1.5);
    let expected_tps = 13.1 * 3.0 * 2.0 * 2.5 * 1.5;
    assert!((result.optimized_tps - expected_tps).abs() < 0.1);
    assert!((result.gap_ratio - 0.81).abs() < 0.1);
    assert!(result.achieves_m4());
    assert_eq!(result.milestone, Some("M4".to_string()));
}
#[test]
fn test_imp900_total_improvement() {
    // IMP-900: total improvement is the product of the individual factors:
    // 2.0 * 1.5 * 2.0 * 1.5 = 9.0.
    let result = Imp900Result::from_baseline(13.1)
        .with_gemm_improvement(2.0)
        .with_fusion_improvement(1.5)
        .with_flash_attention_improvement(2.0)
        .with_memory_improvement(1.5);
    let total_delta = (result.total_improvement() - 9.0).abs();
    assert!(total_delta < 0.1);
}
}