/// Timing and token counters for the load, prefill and decode stages.
///
/// Durations are stored in milliseconds and token counts as plain integers.
/// All counters start at zero (see [`PerfMetrics::new`]) and can be cleared
/// again with [`PerfMetrics::reset`].
#[derive(Debug, Clone, Default)]
pub struct PerfMetrics {
    /// Load time in milliseconds, as last set by [`PerfMetrics::record_load`].
    pub t_load_ms: u64,
    /// Prefill time in milliseconds, as last set by
    /// [`PerfMetrics::record_prefill`].
    pub t_p_eval_ms: u64,
    /// Decode time in milliseconds, accumulated over every decode recording.
    pub t_eval_ms: u64,
    /// Token count of the most recently recorded prefill.
    pub n_p_eval: u32,
    /// Number of decoded tokens, accumulated over every decode recording.
    pub n_eval: u32,
    /// Number of decode recordings, i.e. calls to
    /// [`PerfMetrics::record_decode`] or [`PerfMetrics::record_decode_batch`].
    pub n_samples: u32,
}

impl PerfMetrics {
    /// Creates a metrics value with every counter set to zero.
    pub fn new() -> Self {
        Self::default()
    }

    /// Stores the load time, replacing any previously recorded value.
    pub fn record_load(&mut self, ms: u64) {
        self.t_load_ms = ms;
    }

    /// Stores the prefill time and the number of prefilled tokens.
    ///
    /// Both values replace (rather than add to) whatever was recorded before.
    pub fn record_prefill(&mut self, ms: u64, tokens: u32) {
        self.t_p_eval_ms = ms;
        self.n_p_eval = tokens;
    }

    /// Records a decode step that took `ms` milliseconds and produced one
    /// token.
    ///
    /// Equivalent to `record_decode_batch(ms, 1)`.
    pub fn record_decode(&mut self, ms: u64) {
        self.record_decode_batch(ms, 1);
    }

    /// Records a decode step that took `ms` milliseconds and produced
    /// `tokens` tokens.
    ///
    /// The counters saturate at their maximum value instead of overflowing,
    /// so recording never panics (debug builds) or wraps around (release
    /// builds).
    pub fn record_decode_batch(&mut self, ms: u64, tokens: u32) {
        self.t_eval_ms = self.t_eval_ms.saturating_add(ms);
        self.n_eval = self.n_eval.saturating_add(tokens);
        self.n_samples = self.n_samples.saturating_add(1);
    }

    /// Decode throughput in tokens per second.
    ///
    /// Returns `0.0` when no decode time has been recorded.
    #[must_use]
    pub fn tokens_per_second(&self) -> f64 {
        Self::rate_per_second(self.n_eval, self.t_eval_ms)
    }

    /// Prefill throughput in tokens per second.
    ///
    /// Returns `0.0` when the recorded prefill time is zero.
    #[must_use]
    pub fn prefill_tokens_per_second(&self) -> f64 {
        Self::rate_per_second(self.n_p_eval, self.t_p_eval_ms)
    }

    /// Sum of load, prefill and decode time in milliseconds, saturating at
    /// `u64::MAX`.
    #[must_use]
    pub fn total_ms(&self) -> u64 {
        self.time_to_first_token_ms().saturating_add(self.t_eval_ms)
    }

    /// Sum of load and prefill time in milliseconds, saturating at
    /// `u64::MAX`.
    #[must_use]
    pub fn time_to_first_token_ms(&self) -> u64 {
        self.t_load_ms.saturating_add(self.t_p_eval_ms)
    }

    /// Average decode time per token in milliseconds.
    ///
    /// Returns `0.0` when no tokens have been decoded.
    #[must_use]
    pub fn avg_token_latency_ms(&self) -> f64 {
        if self.n_eval == 0 {
            return 0.0;
        }
        // `u64 -> f64` has no lossless conversion; precision only degrades
        // above 2^53 ms, far beyond any realistic measurement.
        self.t_eval_ms as f64 / f64::from(self.n_eval)
    }

    /// Formats all recorded timings, throughputs and token counts on one
    /// line.
    #[must_use]
    pub fn summary(&self) -> String {
        format!(
            "load: {}ms, prefill: {}ms ({:.1} tok/s, {} tokens), decode: {}ms ({:.1} tok/s, {} tokens), total: {}ms",
            self.t_load_ms,
            self.t_p_eval_ms,
            self.prefill_tokens_per_second(),
            self.n_p_eval,
            self.t_eval_ms,
            self.tokens_per_second(),
            self.n_eval,
            self.total_ms()
        )
    }

    /// Clears every counter back to zero.
    pub fn reset(&mut self) {
        *self = Self::default();
    }

    /// Converts a token count and a duration in milliseconds into tokens per
    /// second, yielding `0.0` for a zero duration to avoid dividing by zero.
    fn rate_per_second(tokens: u32, elapsed_ms: u64) -> f64 {
        if elapsed_ms == 0 {
            return 0.0;
        }
        // Multiply before dividing so results match `1000 * tokens / ms`
        // bit-for-bit.
        1000.0 * f64::from(tokens) / elapsed_ms as f64
    }
}
/// Phase of an inference run.
///
/// The type is a plain `Copy` tag: it carries no data and can be compared
/// for equality.
#[derive(Default, Debug, PartialEq, Eq, Clone, Copy)]
pub enum InferencePhase {
    /// Prefill phase; this is the value produced by
    /// `InferencePhase::default()`.
    // NOTE(review): presumably the stage timed by
    // `PerfMetrics::record_prefill` — confirm against callers.
    #[default]
    Prefill,
    /// Decode phase.
    // NOTE(review): presumably the stage timed by
    // `PerfMetrics::record_decode` — confirm against callers.
    Decode,
}
#[cfg(test)]
mod tests {
    use super::*;

    // Tolerance used when comparing computed floating-point rates.
    const EPS: f64 = 0.001;

    #[test]
    fn test_perf_metrics_default() {
        let metrics = PerfMetrics::default();
        assert_eq!(metrics.t_load_ms, 0);
        assert_eq!(metrics.t_p_eval_ms, 0);
        assert_eq!(metrics.t_eval_ms, 0);
        assert_eq!(metrics.n_p_eval, 0);
        assert_eq!(metrics.n_eval, 0);
        assert_eq!(metrics.n_samples, 0);
    }

    #[test]
    fn test_perf_metrics_record_load() {
        let mut metrics = PerfMetrics::new();
        metrics.record_load(1500);
        assert_eq!(metrics.t_load_ms, 1500);
    }

    #[test]
    fn test_perf_metrics_record_prefill() {
        let mut metrics = PerfMetrics::new();
        metrics.record_prefill(200, 512);
        assert_eq!(metrics.t_p_eval_ms, 200);
        assert_eq!(metrics.n_p_eval, 512);
    }

    #[test]
    fn test_perf_metrics_record_decode() {
        let mut metrics = PerfMetrics::new();
        metrics.record_decode(50);
        metrics.record_decode(50);
        assert_eq!(metrics.t_eval_ms, 100);
        assert_eq!(metrics.n_eval, 2);
        assert_eq!(metrics.n_samples, 2);
    }

    #[test]
    fn test_perf_metrics_record_decode_batch() {
        // Each batch counts as one sample regardless of its token count.
        let mut metrics = PerfMetrics::new();
        metrics.record_decode_batch(300, 100);
        metrics.record_decode_batch(300, 100);
        assert_eq!(metrics.t_eval_ms, 600);
        assert_eq!(metrics.n_eval, 200);
        assert_eq!(metrics.n_samples, 2);
    }

    #[test]
    fn test_perf_metrics_tokens_per_second() {
        let mut metrics = PerfMetrics::new();
        metrics.record_decode_batch(1000, 100);
        assert!((metrics.tokens_per_second() - 100.0).abs() < EPS);
    }

    #[test]
    fn test_perf_metrics_prefill_throughput() {
        let mut metrics = PerfMetrics::new();
        metrics.record_prefill(500, 1000);
        assert!((metrics.prefill_tokens_per_second() - 2000.0).abs() < EPS);
    }

    #[test]
    fn test_perf_metrics_rates_with_no_data() {
        // The zero-denominator guards must yield 0.0 rather than NaN or inf.
        let metrics = PerfMetrics::new();
        assert!(metrics.tokens_per_second().abs() < EPS);
        assert!(metrics.prefill_tokens_per_second().abs() < EPS);
        assert!(metrics.avg_token_latency_ms().abs() < EPS);
    }

    #[test]
    fn test_perf_metrics_avg_token_latency() {
        let mut metrics = PerfMetrics::new();
        metrics.record_decode_batch(300, 100);
        assert!((metrics.avg_token_latency_ms() - 3.0).abs() < EPS);
    }

    #[test]
    fn test_perf_metrics_total_ms() {
        let mut metrics = PerfMetrics::new();
        metrics.record_load(1000);
        metrics.record_prefill(200, 512);
        metrics.record_decode_batch(300, 100);
        assert_eq!(metrics.total_ms(), 1500);
    }

    #[test]
    fn test_perf_metrics_time_to_first_token() {
        let mut metrics = PerfMetrics::new();
        metrics.record_load(1000);
        metrics.record_prefill(200, 512);
        assert_eq!(metrics.time_to_first_token_ms(), 1200);
    }

    #[test]
    fn test_perf_metrics_summary() {
        let mut metrics = PerfMetrics::new();
        metrics.record_load(1000);
        metrics.record_prefill(200, 512);
        metrics.record_decode_batch(300, 100);
        assert_eq!(
            metrics.summary(),
            "load: 1000ms, prefill: 200ms (2560.0 tok/s, 512 tokens), decode: 300ms (333.3 tok/s, 100 tokens), total: 1500ms"
        );
    }

    #[test]
    fn test_perf_metrics_reset() {
        let mut metrics = PerfMetrics::new();
        metrics.record_load(1500);
        metrics.record_prefill(200, 512);
        metrics.record_decode_batch(300, 100);
        metrics.reset();
        assert_eq!(metrics.t_load_ms, 0);
        assert_eq!(metrics.t_p_eval_ms, 0);
        assert_eq!(metrics.t_eval_ms, 0);
        assert_eq!(metrics.n_p_eval, 0);
        assert_eq!(metrics.n_eval, 0);
        assert_eq!(metrics.n_samples, 0);
    }

    #[test]
    fn test_inference_phase_default() {
        let phase = InferencePhase::default();
        assert_eq!(phase, InferencePhase::Prefill);
    }

    #[test]
    fn test_inference_phase_eq() {
        assert_eq!(InferencePhase::Prefill, InferencePhase::Prefill);
        assert_eq!(InferencePhase::Decode, InferencePhase::Decode);
        assert_ne!(InferencePhase::Prefill, InferencePhase::Decode);
    }
}