use crate::error::Result;
use super::{AprTransformer, GenerateConfig};
/// Decode throughput threshold, in tokens per second.
/// NOTE(review): not referenced in the visible part of this file; presumably compared against
/// `AprBenchmarkResult::tokens_per_second` (e.g. via `meets_threshold`) by callers — confirm.
pub const APR_CPU_DECODE_THRESHOLD_TOK_S: f64 = 50.0;
/// Prefill throughput threshold, in tokens per second.
/// NOTE(review): not referenced in the visible part of this file; presumably compared against
/// `AprPrefillResult::prefill_tok_s` by callers — confirm.
pub const APR_PREFILL_THRESHOLD_TOK_S: f64 = 100.0;
/// Parity threshold as a percentage; `AprBenchmarkResult::compare_to_baseline` copies it into
/// `AprParityComparison::parity_threshold_pct`.
pub const APR_PARITY_THRESHOLD_PCT: f64 = 95.0;
/// Aggregated measurements from a decode benchmark.
///
/// `AprBenchmarkRunner::benchmark_decode` fills every field; the per-field notes below describe
/// how that function derives each value.
#[derive(Debug, Clone, Default)]
pub struct AprBenchmarkResult {
    /// Tokens generated per measured iteration (integer average over the iterations).
    pub tokens_generated: usize,
    /// Wall-clock time per measured iteration in milliseconds (an average, despite the name).
    pub total_time_ms: f64,
    /// Mean of the per-iteration throughputs, in tokens per second.
    pub tokens_per_second: f64,
    /// Middle element of the ascending-sorted per-iteration throughputs.
    pub throughput_p50: f64,
    /// Throughput at sorted index `floor(0.01 * n)`, i.e. taken from the slow end of the samples.
    pub throughput_p99: f64,
    /// Sample standard deviation of the per-iteration throughputs.
    pub throughput_std_dev: f64,
    /// Peak memory in MiB; the runner estimates it as 1.5x `model_memory_mb`.
    pub peak_memory_mb: f64,
    /// Size reported by the transformer, converted from bytes to MiB.
    pub model_memory_mb: f64,
}

impl AprBenchmarkResult {
    /// Returns `true` when `tokens_per_second` is at least `threshold_tok_s`.
    #[must_use]
    pub fn meets_threshold(&self, threshold_tok_s: f64) -> bool {
        threshold_tok_s <= self.tokens_per_second
    }

    /// Expresses this result's throughput and peak memory relative to `baseline`.
    ///
    /// Each ratio is `self / baseline`. When the corresponding baseline value is not strictly
    /// positive the ratio is reported as `1.0` instead of dividing. The returned comparison
    /// carries [`APR_PARITY_THRESHOLD_PCT`] as its parity threshold.
    #[must_use]
    pub fn compare_to_baseline(&self, baseline: &AprBenchmarkResult) -> AprParityComparison {
        // A ratio is only meaningful against a strictly positive reference value.
        let relative_to = |ours: f64, reference: f64| -> f64 {
            if reference > 0.0 {
                ours / reference
            } else {
                1.0
            }
        };

        AprParityComparison {
            throughput_ratio: relative_to(self.tokens_per_second, baseline.tokens_per_second),
            memory_ratio: relative_to(self.peak_memory_mb, baseline.peak_memory_mb),
            parity_threshold_pct: APR_PARITY_THRESHOLD_PCT,
        }
    }
}
/// Result of a prefill benchmark, as produced by `AprBenchmarkRunner::benchmark_prefill`.
#[derive(Debug, Clone, Default)]
pub struct AprPrefillResult {
/// Number of tokens in the benchmarked prompt (`prompt.len()`).
pub prompt_tokens: usize,
/// Mean wall-clock time of one `forward` call over the measured iterations, in milliseconds.
pub prefill_time_ms: f64,
/// Prompt tokens divided by the mean time in seconds; `0.0` when the mean time is not positive.
pub prefill_tok_s: f64,
}
/// Result of a model-load benchmark, as produced by `AprBenchmarkRunner::benchmark_load`.
#[derive(Debug, Clone, Default)]
pub struct AprLoadResult {
/// Wall-clock time spent inside the loader call, in milliseconds.
pub load_time_ms: f64,
}
/// Relative performance of one benchmark result against a baseline.
#[derive(Debug, Clone)]
pub struct AprParityComparison {
    /// Candidate throughput divided by baseline throughput.
    pub throughput_ratio: f64,
    /// Candidate peak memory divided by baseline peak memory.
    pub memory_ratio: f64,
    /// Minimum acceptable throughput, expressed as a percentage of the baseline.
    pub parity_threshold_pct: f64,
}

impl AprParityComparison {
    /// Returns `true` when the throughput ratio reaches the parity threshold.
    ///
    /// The threshold is stored as a percentage, so it is scaled to a fraction before the
    /// comparison. `memory_ratio` plays no part in the decision.
    #[must_use]
    pub fn is_parity(&self) -> bool {
        let required_ratio = self.parity_threshold_pct / 100.0;
        self.throughput_ratio >= required_ratio
    }
}
/// Drives decode, prefill and load benchmarks against an owned [`AprTransformer`].
#[derive(Debug)]
pub struct AprBenchmarkRunner {
    /// Model under test.
    transformer: AprTransformer,
    /// Number of untimed iterations executed before measuring.
    warmup_iterations: usize,
    /// Number of timed iterations; the setter keeps this at one or more.
    measure_iterations: usize,
}

impl AprBenchmarkRunner {
    /// Creates a runner with 3 warmup iterations and 10 measured iterations.
    #[must_use]
    pub fn new(transformer: AprTransformer) -> Self {
        Self {
            transformer,
            warmup_iterations: 3,
            measure_iterations: 10,
        }
    }

    /// Number of untimed warmup iterations.
    #[must_use]
    pub fn warmup_iterations(&self) -> usize {
        self.warmup_iterations
    }

    /// Number of timed iterations.
    #[must_use]
    pub fn measure_iterations(&self) -> usize {
        self.measure_iterations
    }

    /// Sets the number of warmup iterations; `0` disables warmup.
    pub fn set_warmup_iterations(&mut self, n: usize) {
        self.warmup_iterations = n;
    }

    /// Sets the number of timed iterations, clamped to at least 1.
    pub fn set_measure_iterations(&mut self, n: usize) {
        self.measure_iterations = n.max(1);
    }

    /// Measures generation throughput for `prompt`, requesting `num_tokens` tokens per run.
    ///
    /// Warmup runs request at most 5 tokens and are not timed. Each measured run is timed
    /// around `generate_with_cache`; the number of generated tokens is taken as the output
    /// length minus the prompt length (saturating at zero).
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by `generate_with_cache`.
    pub fn benchmark_decode(
        &mut self,
        prompt: &[u32],
        num_tokens: usize,
    ) -> Result<AprBenchmarkResult> {
        use std::time::Instant;

        // Both configurations are loop-invariant, so they are built once up front.
        let warmup_config = GenerateConfig {
            max_tokens: num_tokens.min(5),
            temperature: 0.0,
            ..Default::default()
        };
        let measure_config = GenerateConfig {
            max_tokens: num_tokens,
            temperature: 0.0,
            ..Default::default()
        };

        for _ in 0..self.warmup_iterations {
            let _ = self.transformer.generate_with_cache(prompt, &warmup_config)?;
        }

        let mut throughputs = Vec::with_capacity(self.measure_iterations);
        let mut total_tokens = 0usize;
        let mut total_time_ms = 0.0f64;
        for _ in 0..self.measure_iterations {
            let start = Instant::now();
            let output = self.transformer.generate_with_cache(prompt, &measure_config)?;
            let time_ms = start.elapsed().as_secs_f64() * 1000.0;

            let generated = output.len().saturating_sub(prompt.len());
            // Guard against a zero-length measurement instead of dividing by zero.
            let throughput = if time_ms > 0.0 {
                (generated as f64) / (time_ms / 1000.0)
            } else {
                0.0
            };
            throughputs.push(throughput);
            total_tokens += generated;
            total_time_ms += time_ms;
        }

        // Mean and standard deviation are computed in measurement order, before sorting,
        // so the floating-point summation order is independent of the sort.
        let sample_count = throughputs.len();
        let mean_throughput = if sample_count == 0 {
            0.0
        } else {
            throughputs.iter().sum::<f64>() / sample_count as f64
        };
        // Sample standard deviation (n - 1 denominator); undefined for fewer than two samples.
        let std_dev = if sample_count > 1 {
            let squared_deviations = throughputs
                .iter()
                .map(|t| (t - mean_throughput).powi(2))
                .sum::<f64>();
            (squared_deviations / (sample_count - 1) as f64).sqrt()
        } else {
            0.0
        };

        // Sort in place (ascending); the unsorted order is no longer needed.
        throughputs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let p50 = throughputs.get(sample_count / 2).copied().unwrap_or(0.0);
        // With ascending order, index floor(0.01 * n) sits at the slow end of the samples,
        // so "p99" here is a near-worst-case throughput.
        let p99_idx =
            ((sample_count as f64 * 0.01).floor() as usize).min(sample_count.saturating_sub(1));
        let p99 = throughputs.get(p99_idx).copied().unwrap_or(0.0);

        let model_memory_mb = (self.transformer.memory_size() as f64) / (1024.0 * 1024.0);
        let iterations = self.measure_iterations.max(1);
        Ok(AprBenchmarkResult {
            tokens_generated: total_tokens / iterations,
            total_time_ms: total_time_ms / iterations as f64,
            tokens_per_second: mean_throughput,
            throughput_p50: p50,
            throughput_p99: p99,
            throughput_std_dev: std_dev,
            // Peak memory is not measured; it is estimated as 1.5x the model size.
            peak_memory_mb: model_memory_mb * 1.5,
            model_memory_mb,
        })
    }

    /// Measures how long a single `forward` call over `prompt` takes.
    ///
    /// Reports the mean time across the measured iterations and the resulting prompt
    /// throughput (`0.0` when the mean time is not positive).
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by `forward`.
    pub fn benchmark_prefill(&mut self, prompt: &[u32]) -> Result<AprPrefillResult> {
        use std::time::Instant;

        for _ in 0..self.warmup_iterations {
            let _ = self.transformer.forward(prompt)?;
        }

        let mut prefill_times_ms = Vec::with_capacity(self.measure_iterations);
        for _ in 0..self.measure_iterations {
            let start = Instant::now();
            let _ = self.transformer.forward(prompt)?;
            prefill_times_ms.push(start.elapsed().as_secs_f64() * 1000.0);
        }

        let mean_time_ms = if prefill_times_ms.is_empty() {
            0.0
        } else {
            prefill_times_ms.iter().sum::<f64>() / prefill_times_ms.len() as f64
        };
        let prefill_tok_s = if mean_time_ms > 0.0 {
            (prompt.len() as f64) / (mean_time_ms / 1000.0)
        } else {
            0.0
        };

        Ok(AprPrefillResult {
            prompt_tokens: prompt.len(),
            prefill_time_ms: mean_time_ms,
            prefill_tok_s,
        })
    }

    /// Times a single invocation of `loader`.
    ///
    /// The loader is called exactly once, so any `FnOnce` is accepted; this includes
    /// closures that move captured state (for example an owned byte buffer) into the model.
    /// The current implementation always returns `Ok`.
    pub fn benchmark_load<F>(loader: F) -> Result<AprLoadResult>
    where
        F: FnOnce() -> AprTransformer,
    {
        use std::time::Instant;

        let start = Instant::now();
        // Bind the model so that it is dropped only after the elapsed time has been read.
        let _transformer = loader();
        let elapsed = start.elapsed();

        Ok(AprLoadResult {
            load_time_ms: elapsed.as_secs_f64() * 1000.0,
        })
    }
}
include!("benchmark_apr_cpu_prefill.rs");