impl ItlMetrics {
    /// Builds inter-token-latency statistics from raw measurements in milliseconds.
    ///
    /// * `median_ms` is the middle value (midpoint of the two middle values for an
    ///   even count).
    /// * `std_dev_ms` is the sample standard deviation (divisor `n - 1`, clamped to
    ///   at least 1 so a single measurement yields `0.0`).
    /// * `p99_ms` / `p999_ms` use the nearest-rank method on the sorted data.
    ///
    /// An empty slice returns `Self::default()`.
    ///
    /// Values are ordered with [`f64::total_cmp`], which is a total order even when
    /// NaN is present (positive NaN sorts after `+inf`, negative NaN before `-inf`).
    /// The previous `partial_cmp(..).unwrap_or(Equal)` comparator was not a total
    /// order with NaN in the input, which leaves the slice effectively unsorted and
    /// may make the standard-library sort panic.
    #[must_use]
    pub fn from_measurements(itl_times_ms: &[f64]) -> Self {
        if itl_times_ms.is_empty() {
            return Self::default();
        }

        let mut sorted = itl_times_ms.to_vec();
        sorted.sort_unstable_by(f64::total_cmp);
        let n = sorted.len();
        let count = n as f64;

        let median_ms = if n.is_multiple_of(2) {
            f64::midpoint(sorted[n / 2 - 1], sorted[n / 2])
        } else {
            sorted[n / 2]
        };

        let mean = itl_times_ms.iter().sum::<f64>() / count;
        let squared_deviations: f64 = itl_times_ms.iter().map(|x| (x - mean).powi(2)).sum();
        // Bessel-corrected divisor; `.max(1.0)` avoids dividing by zero when n == 1.
        let std_dev_ms = (squared_deviations / (count - 1.0).max(1.0)).sqrt();

        // Nearest-rank percentile: 1-based rank ceil(n * q), converted to a 0-based
        // index and clamped into the valid range.
        let nearest_rank = |quantile: f64| {
            let idx = ((count * quantile).ceil() as usize)
                .saturating_sub(1)
                .min(n - 1);
            sorted[idx]
        };

        Self {
            median_ms,
            std_dev_ms,
            p99_ms: nearest_rank(0.99),
            p999_ms: nearest_rank(0.999),
        }
    }

    /// Returns `true` when the standard deviation is strictly below `threshold_ms`.
    ///
    /// A NaN standard deviation or threshold always yields `false`.
    #[must_use]
    pub fn is_low_jitter(&self, threshold_ms: f64) -> bool {
        self.std_dev_ms < threshold_ms
    }
}
/// Outcome of a quantization quality check, as returned by
/// `validate_quantization_quality` in this file.
#[derive(Debug, Clone, PartialEq)]
pub enum QualityResult {
/// The measured KL divergence was below the caller's threshold.
Pass {
/// KL divergence between the two probability distributions.
kl_divergence: f64,
},
/// The check did not pass.
Fail {
/// Measured KL divergence; `validate_quantization_quality` reports
/// `f64::INFINITY` here when the logit vectors differ in length.
kl_divergence: f64,
/// The threshold the divergence was compared against.
threshold: f64,
/// Static description of why the check failed.
message: &'static str,
},
}
/// Converts logits into a probability distribution.
///
/// The largest logit is subtracted from every entry (in `f32`) before the
/// difference is widened to `f64` and exponentiated, so the largest exponent is
/// `exp(0) = 1` and the computation cannot overflow. An empty slice yields an
/// empty vector.
fn softmax(logits: &[f32]) -> Vec<f64> {
    let mut peak = f32::NEG_INFINITY;
    for &logit in logits {
        peak = f32::max(peak, logit);
    }

    let weights: Vec<f64> = logits
        .iter()
        .map(|&logit| f64::from(logit - peak).exp())
        .collect();
    let total = weights.iter().sum::<f64>();

    weights.into_iter().map(|weight| weight / total).collect()
}
/// Compares quantized logits against their FP32 reference using the KL divergence
/// `KL(P_fp32 || P_quantized)` of the softmax distributions.
///
/// Returns:
/// * `Fail` with `kl_divergence = f64::INFINITY` when the slices differ in length;
/// * `Pass { kl_divergence: 0.0 }` when both slices are empty;
/// * `Pass` when the divergence is strictly below `threshold`, otherwise `Fail`.
///
/// Reference probabilities at or below `1e-10` contribute nothing to the sum.
/// Quantized probabilities are floored at `1e-10` instead of being skipped:
/// skipping them reported a divergence of zero precisely when the quantized model
/// assigned (almost) no probability to a token the reference considers likely,
/// letting a badly broken quantization pass.
///
/// If either distribution contains NaN (for example from NaN or `+inf` logits) the
/// divergence is NaN and the result is `Fail`, rather than silently treating the
/// uncomputable terms as zero.
#[must_use]
pub fn validate_quantization_quality(
    fp32_logits: &[f32],
    quantized_logits: &[f32],
    threshold: f64,
) -> QualityResult {
    /// Smallest probability that takes part in the divergence sum.
    const PROB_FLOOR: f64 = 1e-10;

    if fp32_logits.len() != quantized_logits.len() {
        return QualityResult::Fail {
            kl_divergence: f64::INFINITY,
            threshold,
            message: "Logit vector lengths do not match",
        };
    }
    if fp32_logits.is_empty() {
        return QualityResult::Pass { kl_divergence: 0.0 };
    }

    let fp32_probs = softmax(fp32_logits);
    let quant_probs = softmax(quantized_logits);

    let kl_div: f64 = fp32_probs
        .iter()
        .zip(&quant_probs)
        .map(|(&p, &q)| {
            if p.is_nan() || q.is_nan() {
                // Propagate so the final comparison fails instead of passing.
                f64::NAN
            } else if p > PROB_FLOOR {
                p * (p / q.max(PROB_FLOOR)).ln()
            } else {
                0.0
            }
        })
        .sum();

    // NaN compares false, so an uncomputable divergence lands in the `Fail` arm.
    if kl_div < threshold {
        QualityResult::Pass {
            kl_divergence: kl_div,
        }
    } else {
        QualityResult::Fail {
            kl_divergence: kl_div,
            threshold,
            message: "Quantization quality degradation detected",
        }
    }
}
/// Describes the setup a benchmark result was recorded under; stored as the
/// `config` field of `BenchmarkResult`.
///
/// NOTE(review): every field is a free-form `String`; nothing in this part of
/// the file constrains or validates the values, so the per-field notes are
/// read off the names and should be confirmed against the producers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkConfig {
/// Model label (presumably a model name or path).
pub model: String,
/// Model file/serialization format label.
pub format: String,
/// Quantization scheme label.
pub quantization: String,
/// Inference runtime label.
pub runtime: String,
/// Version string of that runtime.
pub runtime_version: String,
}
/// Raw measurements collected for one benchmark configuration.
///
/// `summary()` condenses the sample vectors into a `BenchmarkSummary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResult {
/// Configuration the measurements belong to.
pub config: BenchmarkConfig,
/// NOTE(review): presumably cold-start time in milliseconds; not read in this part of the file.
pub cold_start_ms: f64,
/// NOTE(review): presumably model load time in milliseconds; not read in this part of the file.
pub model_load_ms: f64,
/// Time-to-first-token samples; `summary()` reports their p50/p95/p99/p99.9.
pub ttft_ms: Vec<f64>,
/// Inter-token-latency samples; `summary()` reports their median and standard deviation.
pub itl_ms: Vec<f64>,
/// Throughput samples; `summary()` reports their median and a 95% bootstrap interval.
pub generation_tok_s: Vec<f64>,
/// NOTE(review): presumably peak memory in megabytes; not read in this part of the file.
pub peak_memory_mb: u64,
/// Copied unchanged into `BenchmarkSummary::memory_waste_pct`.
pub kv_cache_waste_pct: f64,
/// Total energy; divided by `tokens_generated` to obtain joules per token.
pub energy_joules: f64,
/// Token count used as the divisor for joules per token (zero yields `0.0`).
pub tokens_generated: u64,
/// Copied unchanged into `BenchmarkSummary::iterations`.
pub actual_iterations: usize,
/// Copied unchanged into `BenchmarkSummary::cv_final`.
pub cv_at_stop: f64,
/// NOTE(review): unit and epoch are not established in this part of the file.
pub timestamp: u64,
}
/// Condensed statistics derived from a `BenchmarkResult` by
/// `BenchmarkResult::summary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
/// 50th percentile of `BenchmarkResult::ttft_ms`.
pub ttft_p50: f64,
/// 95th percentile of `BenchmarkResult::ttft_ms`.
pub ttft_p95: f64,
/// 99th percentile of `BenchmarkResult::ttft_ms`.
pub ttft_p99: f64,
/// 99.9th percentile of `BenchmarkResult::ttft_ms`.
pub ttft_p999: f64,
/// 50th percentile of `BenchmarkResult::itl_ms`.
pub itl_median: f64,
/// Standard deviation of `BenchmarkResult::itl_ms`.
pub itl_std_dev: f64,
/// 50th percentile of `BenchmarkResult::generation_tok_s`.
pub throughput_median: f64,
/// `(lower, upper)` bounds of the 95% bootstrap interval (1000 resamples)
/// over `BenchmarkResult::generation_tok_s`.
pub throughput_ci_95: (f64, f64),
/// `energy_joules / tokens_generated`, or `0.0` when no tokens were generated.
pub token_joules: f64,
/// Copy of `BenchmarkResult::kv_cache_waste_pct`.
pub memory_waste_pct: f64,
/// Copy of `BenchmarkResult::actual_iterations`.
pub iterations: usize,
/// Copy of `BenchmarkResult::cv_at_stop`.
pub cv_final: f64,
}
impl BenchmarkResult {
    /// Condenses the raw samples into a [`BenchmarkSummary`].
    ///
    /// TTFT percentiles, the ITL median and the throughput median come from
    /// `percentile`; the ITL spread from `compute_std_dev`; the throughput
    /// interval from `bootstrap_ci` at 95% confidence with 1000 resamples.
    /// Energy per token is `0.0` when no tokens were generated. The remaining
    /// fields are copied across unchanged.
    #[must_use]
    pub fn summary(&self) -> BenchmarkSummary {
        let ttft = self.ttft_ms.as_slice();
        let itl = self.itl_ms.as_slice();
        let throughput = self.generation_tok_s.as_slice();

        // Guard the division so an empty run reports zero rather than inf/NaN.
        let token_joules = match self.tokens_generated {
            0 => 0.0,
            tokens => self.energy_joules / tokens as f64,
        };

        BenchmarkSummary {
            ttft_p50: percentile(ttft, 50.0),
            ttft_p95: percentile(ttft, 95.0),
            ttft_p99: percentile(ttft, 99.0),
            ttft_p999: percentile(ttft, 99.9),
            itl_median: percentile(itl, 50.0),
            itl_std_dev: compute_std_dev(itl),
            throughput_median: percentile(throughput, 50.0),
            throughput_ci_95: bootstrap_ci(throughput, 0.95, 1000),
            token_joules,
            memory_waste_pct: self.kv_cache_waste_pct,
            iterations: self.actual_iterations,
            cv_final: self.cv_at_stop,
        }
    }
}
/// Returns the `p`-th percentile (`p` in percent, e.g. `99.9`) of `data` using
/// the nearest-rank method, or `0.0` for an empty slice.
///
/// The 1-based rank is `ceil(len * p / 100)`, clamped into `1..=len`, so `p <= 0`
/// yields the minimum and `p >= 100` the maximum.
///
/// Values are ordered with [`f64::total_cmp`], which stays a total order when NaN
/// is present (positive NaN sorts after `+inf`, negative NaN before `-inf`). The
/// previous `partial_cmp(..).unwrap_or(Equal)` comparator was not a total order
/// with NaN in the input, which leaves the data effectively unsorted and may make
/// the standard-library sort panic.
fn percentile(data: &[f64], p: f64) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    let mut sorted = data.to_vec();
    sorted.sort_unstable_by(f64::total_cmp);

    let last = sorted.len() - 1;
    let rank = (sorted.len() as f64 * p / 100.0).ceil() as usize;
    sorted[rank.saturating_sub(1).min(last)]
}
/// Returns the square root of whatever `compute_variance` reports for `data`.
fn compute_std_dev(data: &[f64]) -> f64 {
    let variance = compute_variance(data);
    variance.sqrt()
}
/// Estimates a percentile-bootstrap confidence interval for the mean of `data`.
///
/// Draws `n_resamples` resamples of `data.len()` values with replacement, takes
/// the mean of each, sorts the means and returns the values at the `alpha / 2`
/// and `1 - alpha / 2` positions, where `alpha = 1 - confidence`.
///
/// Returns `(0.0, 0.0)` when `data` is empty or `n_resamples` is zero (the latter
/// previously underflowed `n_resamples - 1` and indexed an empty vector).
///
/// Indices come from a SplitMix64 generator with a fixed seed, so the result is
/// deterministic for a given input. The earlier scheme derived index `j` of a
/// resample as `seed * (j + 1) % n`, an arithmetic progression that frequently
/// visits every element exactly once; such "resamples" all have the sample mean
/// and the interval collapsed or was badly skewed.
fn bootstrap_ci(data: &[f64], confidence: f64, n_resamples: usize) -> (f64, f64) {
    /// Fixed seed keeps repeated summaries of the same data identical.
    const SEED: u64 = 6_364_136_223_846_793_005;

    if data.is_empty() || n_resamples == 0 {
        return (0.0, 0.0);
    }

    let n = data.len();

    // SplitMix64: one state update plus a bit-mixing finalizer per output.
    let mut state = SEED;
    let mut next_u64 = move || {
        state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
        let mut z = state;
        z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
        z ^ (z >> 31)
    };

    let mut bootstrap_means = Vec::with_capacity(n_resamples);
    for _ in 0..n_resamples {
        let mut sum = 0.0;
        for _ in 0..n {
            // The remainder is below `n`, so converting back to usize is lossless.
            let idx = (next_u64() % (n as u64)) as usize;
            sum += data[idx];
        }
        bootstrap_means.push(sum / n as f64);
    }
    bootstrap_means.sort_unstable_by(f64::total_cmp);

    let alpha = 1.0 - confidence;
    let last = n_resamples - 1;
    let lower_idx = ((n_resamples as f64 * alpha / 2.0).floor() as usize).min(last);
    let upper_idx = ((n_resamples as f64 * (1.0 - alpha / 2.0)).ceil() as usize).min(last);
    (bootstrap_means[lower_idx], bootstrap_means[upper_idx])
}
/// Workload shapes with fixed token budgets (see `input_tokens` and
/// `output_tokens`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WorkloadType {
/// 32 input tokens, 64 output tokens.
ShortQa,
/// 2048 input tokens, 512 output tokens.
LongContext,
}
impl WorkloadType {
    /// Number of input tokens for this workload: 32 for `ShortQa`, 2048 for
    /// `LongContext`.
    #[must_use]
    pub const fn input_tokens(&self) -> usize {
        match *self {
            Self::LongContext => 2048,
            Self::ShortQa => 32,
        }
    }

    /// Number of output tokens for this workload: 64 for `ShortQa`, 512 for
    /// `LongContext`.
    #[must_use]
    pub const fn output_tokens(&self) -> usize {
        match *self {
            Self::LongContext => 512,
            Self::ShortQa => 64,
        }
    }
}
/// Parameters and acceptance limits for a convoy test.
///
/// NOTE(review): the code that consumes these limits is not in this part of
/// the file; field meanings beyond the defaults are read off the names.
#[derive(Debug, Clone)]
pub struct ConvoyTestConfig {
/// Number of long requests (default 10).
pub long_requests: usize,
/// Number of short requests (default 100).
pub short_requests: usize,
/// Limit on the p99 increase, in percent (default 50.0).
pub max_p99_increase_pct: f64,
/// Limit on head-of-line blocking, in milliseconds (default 500.0).
pub max_hol_blocking_ms: f64,
/// Limit on KV fragmentation, in percent (default 15.0).
pub max_kv_fragmentation_pct: f64,
}
impl Default for ConvoyTestConfig {
    /// Default convoy setup: 10 long and 100 short requests, with limits of 50%
    /// p99 increase, 500 ms head-of-line blocking and 15% KV fragmentation.
    fn default() -> Self {
        let (long_requests, short_requests) = (10, 100);
        let max_p99_increase_pct = 50.0;
        let max_hol_blocking_ms = 500.0;
        let max_kv_fragmentation_pct = 15.0;

        Self {
            long_requests,
            short_requests,
            max_p99_increase_pct,
            max_hol_blocking_ms,
            max_kv_fragmentation_pct,
        }
    }
}
/// Per-request measurements recorded during a convoy test.
///
/// NOTE(review): nothing in this part of the file constructs or reads this
/// struct; the field notes are read off the names and should be confirmed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvoyRequestResult {
/// Workload label stored as a free-form string (not the `WorkloadType` enum).
pub workload_type: String,
/// Presumably time spent queued, in milliseconds.
pub queue_time_ms: f64,
/// Presumably time to first token, in milliseconds.
pub ttft_ms: f64,
/// Presumably end-to-end latency, in milliseconds.
pub total_latency_ms: f64,
}
/// Aggregated outcome of a convoy test.
///
/// NOTE(review): nothing in this part of the file populates this struct; the
/// field notes are read off the names and should be confirmed against the
/// code that builds it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvoyTestResult {
/// Number of long requests in the run.
pub long_requests: usize,
/// Number of short requests in the run.
pub short_requests: usize,
/// Presumably the short-request p99 latency without long requests, in milliseconds.
pub baseline_short_p99_ms: f64,
/// Presumably the short-request p99 latency with long requests present, in milliseconds.
pub convoy_short_p99_ms: f64,
/// Presumably the relative increase from baseline to convoy p99, in percent.
pub p99_increase_pct: f64,
/// Presumably the largest observed head-of-line blocking, in milliseconds.
pub max_hol_blocking_ms: f64,
/// Presumably the average head-of-line blocking, in milliseconds.
pub avg_hol_blocking_ms: f64,
/// Presumably the observed KV fragmentation, in percent.
pub kv_fragmentation_pct: f64,
/// Whether the run passed.
pub passed: bool,
/// Human-readable reasons for a failure; presumably empty when `passed` is true.
pub failure_reasons: Vec<String>,
}