use std::time::Instant;
use crate::runtime::config::KvReuseMode;
use crate::runtime::numeric::duration_ms;
use crate::runtime::request::GenerateRequest;
/// Cache source tag carried in `RuntimeObservabilityMetrics::cache_source`.
///
/// The enum is `#[repr(i32)]` with explicit discriminants, so an `as i32`
/// cast yields exactly the values written next to each variant.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
#[repr(i32)]
pub enum CacheSource {
    /// Default variant (discriminant `0`).
    #[default]
    None = 0,
    /// Discriminant `1`.
    /// NOTE(review): presumably marks reuse served from a live slot — confirm with the producer.
    Live = 1,
    /// Discriminant `2`.
    /// NOTE(review): presumably marks reuse served from a state snapshot — confirm with the producer.
    Snapshot = 2,
}
/// Cache mode as reported in `RuntimeObservabilityMetrics::cache_mode`.
///
/// The variants match `KvReuseMode` name for name; the `From<KvReuseMode>`
/// impl in this file converts between them. The enum is `#[repr(i32)]` with
/// explicit discriminants, so an `as i32` cast yields the values shown below.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
#[repr(i32)]
pub enum CacheMetricMode {
    /// Discriminant `0`.
    Disabled = 0,
    /// Default variant (discriminant `1`).
    #[default]
    LiveSlotPrefix = 1,
    /// Discriminant `2`.
    StateSnapshot = 2,
    /// Discriminant `3`.
    LiveSlotAndSnapshot = 3,
}
impl From<KvReuseMode> for CacheMetricMode {
fn from(mode: KvReuseMode) -> Self {
match mode {
KvReuseMode::Disabled => Self::Disabled,
KvReuseMode::LiveSlotPrefix => Self::LiveSlotPrefix,
KvReuseMode::StateSnapshot => Self::StateSnapshot,
KvReuseMode::LiveSlotAndSnapshot => Self::LiveSlotAndSnapshot,
}
}
}
/// Per-request timing, token-count and cache metrics.
///
/// `from_request` fills most fields from a `GenerateRequest`; `itl_p99_ms`
/// and `e2e_ms` are not set there and keep their `Default` value (`0.0`).
/// Fields ending in `_ms` are durations, in milliseconds per their naming.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct RuntimeObservabilityMetrics {
    /// Time to first token: span from the request's `enqueued_at` to its
    /// `first_token_at`, or `0.0` when either timestamp is absent.
    pub ttft_ms: f64,
    /// Average inter-token latency: `decode_ms / (output_tokens - 1)`, or
    /// `0.0` when there is at most one output token.
    pub itl_avg_ms: f64,
    /// NOTE(review): presumably the 99th-percentile inter-token latency; no
    /// code visible in this file assigns it — confirm against the producer.
    pub itl_p99_ms: f64,
    /// NOTE(review): presumably the value of `e2e_ms_from_request`
    /// (`enqueued_at` to completion); the assignment is not visible here.
    pub e2e_ms: f64,
    /// Copied from the request's `prefill_ms`.
    pub prefill_ms: f64,
    /// Copied from the request's `decode_ms`.
    pub decode_ms: f64,
    /// Copied from the request's `native_gpu_ms`.
    pub native_gpu_ms: f64,
    /// Copied from the request's `native_sync_ms`.
    pub native_sync_ms: f64,
    /// Copied from the request's `native_logic_ms`.
    pub native_logic_ms: f64,
    /// Copied from the request's `input_tokens`.
    pub input_tokens: i32,
    /// Copied from the request's `output_tokens`.
    pub output_tokens: i32,
    /// The request's `cache_mode`, converted into [`CacheMetricMode`].
    pub cache_mode: CacheMetricMode,
    /// Copied from the request's `cache_source`.
    pub cache_source: CacheSource,
    /// Copied from the request's `cache_hits`.
    pub cache_hits: i32,
    /// Copied from the request's `prefill_tokens`.
    pub prefill_tokens: i32,
}
impl RuntimeObservabilityMetrics {
pub(crate) fn average_inter_token_ms(output_tokens: i32, decode_ms: f64) -> f64 {
if output_tokens > 1 {
decode_ms / f64::from(output_tokens - 1)
} else {
0.0
}
}
pub(crate) fn ttft_ms_from_request(request: &GenerateRequest) -> f64 {
request
.first_token_at
.and_then(|first_token_at| {
request
.enqueued_at
.map(|enqueued| duration_ms(enqueued, first_token_at))
})
.unwrap_or(0.0)
}
pub(crate) fn e2e_ms_from_request(request: &GenerateRequest, completed_at: Instant) -> f64 {
request
.enqueued_at
.map(|enqueued| duration_ms(enqueued, completed_at))
.unwrap_or(0.0)
}
pub(crate) fn from_request(request: &GenerateRequest) -> Self {
Self {
ttft_ms: Self::ttft_ms_from_request(request),
itl_avg_ms: Self::average_inter_token_ms(request.output_tokens, request.decode_ms),
prefill_ms: request.prefill_ms,
decode_ms: request.decode_ms,
native_gpu_ms: request.native_gpu_ms,
native_sync_ms: request.native_sync_ms,
native_logic_ms: request.native_logic_ms,
input_tokens: request.input_tokens,
output_tokens: request.output_tokens,
cache_mode: request.cache_mode.into(),
cache_source: request.cache_source,
cache_hits: request.cache_hits,
prefill_tokens: request.prefill_tokens,
..Self::default()
}
}
}
// Unit tests for this module live in a separate file, wired in through
// `#[path]` and compiled only when `cfg(test)` is active.
#[cfg(test)]
#[path = "../../tests/runtime/metrics_tests.rs"]
mod metrics_tests;