oxillama_runtime/
metrics.rs1use std::sync::atomic::{AtomicU64, Ordering};
4use std::sync::Arc;
5use std::time::Duration;
6
/// Atomic counters describing the work an engine has performed.
///
/// Every field is an [`AtomicU64`] that the `record_*` methods update with
/// [`Ordering::Relaxed`], so a shared reference (for example through the
/// [`Arc`] returned by [`EngineMetrics::new`]) is enough to record events.
#[derive(Debug, Default)]
pub struct EngineMetrics {
    /// Decode tokens counted by [`EngineMetrics::record_decode_token`].
    pub tokens_generated: AtomicU64,
    /// Prefill tokens counted by [`EngineMetrics::record_prefill`].
    pub tokens_prefilled: AtomicU64,
    /// KV-cache hits counted by [`EngineMetrics::record_kv_hit`].
    pub kv_cache_hits: AtomicU64,
    /// KV-cache misses counted by [`EngineMetrics::record_kv_miss`].
    pub kv_cache_misses: AtomicU64,
    /// Total decode time in nanoseconds, summed over all recorded decode tokens.
    pub decode_nanos: AtomicU64,
    /// Total prefill time in nanoseconds, summed over all recorded prefills.
    pub prefill_nanos: AtomicU64,
    /// Requests counted by [`EngineMetrics::record_request_start`].
    pub requests_started: AtomicU64,
    /// Requests counted by [`EngineMetrics::record_request_complete`].
    pub requests_completed: AtomicU64,
}

impl EngineMetrics {
    /// Creates a metrics instance with every counter at zero, wrapped in an
    /// [`Arc`] so it can be shared between owners.
    pub fn new() -> Arc<Self> {
        Arc::new(Self::default())
    }

    /// Converts `elapsed` to whole nanoseconds, clamping to `u64::MAX`.
    ///
    /// `Duration::as_nanos` returns a `u128`; a bare `as u64` would silently
    /// drop the high bits for very large durations.
    fn saturating_nanos(elapsed: Duration) -> u64 {
        // After the clamp the value is guaranteed to fit, so the cast is lossless.
        elapsed.as_nanos().min(u128::from(u64::MAX)) as u64
    }

    /// Computes `tokens` per second given a total time of `nanos` nanoseconds.
    ///
    /// Returns `0.0` when `nanos` is zero so callers never divide by zero.
    fn tokens_per_sec(tokens: u64, nanos: u64) -> f64 {
        if nanos == 0 {
            0.0_f64
        } else {
            tokens as f64 / (nanos as f64 * 1e-9)
        }
    }

    /// Records one generated token that took `elapsed` to decode.
    ///
    /// The sample is clamped to `u64::MAX` nanoseconds; the running total
    /// itself still wraps on overflow, as `fetch_add` does.
    pub fn record_decode_token(&self, elapsed: Duration) {
        self.tokens_generated.fetch_add(1, Ordering::Relaxed);
        self.decode_nanos
            .fetch_add(Self::saturating_nanos(elapsed), Ordering::Relaxed);
    }

    /// Records a prefill of `n_tokens` tokens that took `elapsed` in total.
    ///
    /// The sample is clamped to `u64::MAX` nanoseconds; the running total
    /// itself still wraps on overflow, as `fetch_add` does.
    pub fn record_prefill(&self, n_tokens: u64, elapsed: Duration) {
        self.tokens_prefilled.fetch_add(n_tokens, Ordering::Relaxed);
        self.prefill_nanos
            .fetch_add(Self::saturating_nanos(elapsed), Ordering::Relaxed);
    }

    /// Records a single KV-cache hit.
    pub fn record_kv_hit(&self) {
        self.kv_cache_hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Records a single KV-cache miss.
    pub fn record_kv_miss(&self) {
        self.kv_cache_misses.fetch_add(1, Ordering::Relaxed);
    }

    /// Records that a request has started.
    pub fn record_request_start(&self) {
        self.requests_started.fetch_add(1, Ordering::Relaxed);
    }

    /// Records that a request has completed.
    pub fn record_request_complete(&self) {
        self.requests_completed.fetch_add(1, Ordering::Relaxed);
    }

    /// Returns `(decode_tokens_per_sec, prefill_tokens_per_sec)`.
    ///
    /// Each rate is the recorded token count divided by the recorded time in
    /// seconds, or `0.0` when no time has been recorded for that phase.
    pub fn throughput(&self) -> (f64, f64) {
        let decode_tokens = self.tokens_generated.load(Ordering::Relaxed);
        let decode_nanos = self.decode_nanos.load(Ordering::Relaxed);
        let prefill_tokens = self.tokens_prefilled.load(Ordering::Relaxed);
        let prefill_nanos = self.prefill_nanos.load(Ordering::Relaxed);

        (
            Self::tokens_per_sec(decode_tokens, decode_nanos),
            Self::tokens_per_sec(prefill_tokens, prefill_nanos),
        )
    }

    /// Returns the fraction of KV-cache lookups that were hits, in `[0.0, 1.0]`.
    ///
    /// Returns `0.0` when neither a hit nor a miss has been recorded.
    pub fn kv_cache_hit_rate(&self) -> f64 {
        let hits = self.kv_cache_hits.load(Ordering::Relaxed);
        let misses = self.kv_cache_misses.load(Ordering::Relaxed);
        // Sum in u128: two u64 counters cannot overflow it, whereas a u64 sum
        // would panic (debug) or wrap (release) once the counters grow large.
        let total = u128::from(hits) + u128::from(misses);
        if total == 0 {
            0.0
        } else {
            hits as f64 / total as f64
        }
    }

    /// Captures the current counters and derived rates as plain values.
    ///
    /// Each counter is loaded separately, so the fields are not guaranteed to
    /// be mutually consistent while other threads are still recording.
    pub fn snapshot(&self) -> MetricsSnapshot {
        let (decode_tps, prefill_tps) = self.throughput();
        MetricsSnapshot {
            tokens_generated: self.tokens_generated.load(Ordering::Relaxed),
            tokens_prefilled: self.tokens_prefilled.load(Ordering::Relaxed),
            decode_tokens_per_sec: decode_tps,
            prefill_tokens_per_sec: prefill_tps,
            kv_cache_hit_rate: self.kv_cache_hit_rate(),
            requests_started: self.requests_started.load(Ordering::Relaxed),
            requests_completed: self.requests_completed.load(Ordering::Relaxed),
        }
    }

    /// Sets every counter back to zero.
    ///
    /// Counters are cleared one at a time rather than as a single atomic step.
    pub fn reset(&self) {
        self.tokens_generated.store(0, Ordering::Relaxed);
        self.tokens_prefilled.store(0, Ordering::Relaxed);
        self.kv_cache_hits.store(0, Ordering::Relaxed);
        self.kv_cache_misses.store(0, Ordering::Relaxed);
        self.decode_nanos.store(0, Ordering::Relaxed);
        self.prefill_nanos.store(0, Ordering::Relaxed);
        self.requests_started.store(0, Ordering::Relaxed);
        self.requests_completed.store(0, Ordering::Relaxed);
    }
}

/// Plain-value copy of [`EngineMetrics`] produced by [`EngineMetrics::snapshot`].
#[derive(Debug, Clone)]
pub struct MetricsSnapshot {
    /// Value of `tokens_generated` when the snapshot was taken.
    pub tokens_generated: u64,
    /// Value of `tokens_prefilled` when the snapshot was taken.
    pub tokens_prefilled: u64,
    /// Decode throughput in tokens per second (`0.0` if no decode time was recorded).
    pub decode_tokens_per_sec: f64,
    /// Prefill throughput in tokens per second (`0.0` if no prefill time was recorded).
    pub prefill_tokens_per_sec: f64,
    /// Hits divided by hits plus misses (`0.0` if no lookups were recorded).
    pub kv_cache_hit_rate: f64,
    /// Value of `requests_started` when the snapshot was taken.
    pub requests_started: u64,
    /// Value of `requests_completed` when the snapshot was taken.
    pub requests_completed: u64,
}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Two one-second decode samples produce two tokens and two seconds of decode time.
    #[test]
    fn test_record_decode_token_increments() {
        let metrics = EngineMetrics::new();
        let one_second = Duration::from_secs(1);
        metrics.record_decode_token(one_second);
        metrics.record_decode_token(one_second);

        let generated = metrics.tokens_generated.load(Ordering::Relaxed);
        assert_eq!(generated, 2);
        let total_nanos = metrics.decode_nanos.load(Ordering::Relaxed);
        assert_eq!(total_nanos, 2_000_000_000);
    }

    /// Ten tokens at 100 ms each should report roughly ten tokens per second.
    #[test]
    fn test_throughput_decode() {
        let metrics = EngineMetrics::new();
        (0..10).for_each(|_| metrics.record_decode_token(Duration::from_millis(100)));

        let (decode_tps, prefill_tps) = metrics.throughput();
        let decode_error = (decode_tps - 10.0).abs();
        assert!(decode_error < 0.1, "decode_tps={decode_tps}");
        assert_eq!(prefill_tps, 0.0);
    }

    /// A snapshot reflects the prefill, KV-cache and request counters recorded so far.
    #[test]
    fn test_snapshot_fields() {
        let metrics = EngineMetrics::new();
        metrics.record_prefill(5, Duration::from_millis(50));
        metrics.record_kv_hit();
        metrics.record_kv_miss();
        metrics.record_request_start();
        metrics.record_request_complete();

        let MetricsSnapshot {
            tokens_prefilled,
            requests_started,
            requests_completed,
            kv_cache_hit_rate,
            ..
        } = metrics.snapshot();
        assert_eq!(tokens_prefilled, 5);
        assert_eq!(requests_started, 1);
        assert_eq!(requests_completed, 1);
        assert!((kv_cache_hit_rate - 0.5).abs() < 1e-9);
    }

    /// After `reset`, counters and derived rates read as zero.
    #[test]
    fn test_reset_clears_all() {
        let metrics = EngineMetrics::new();
        metrics.record_decode_token(Duration::from_millis(10));
        metrics.record_prefill(3, Duration::from_millis(5));
        metrics.record_kv_hit();

        metrics.reset();

        let after_reset = metrics.snapshot();
        assert_eq!(after_reset.tokens_generated, 0);
        assert_eq!(after_reset.tokens_prefilled, 0);
        assert_eq!(after_reset.decode_tokens_per_sec, 0.0);
        assert_eq!(after_reset.kv_cache_hit_rate, 0.0);
    }

    /// With no lookups recorded the hit rate is defined as zero.
    #[test]
    fn test_kv_cache_hit_rate_zero_when_no_lookups() {
        let rate = EngineMetrics::new().kv_cache_hit_rate();
        assert_eq!(rate, 0.0);
    }

    /// Only hits and no misses gives a hit rate of one.
    #[test]
    fn test_kv_cache_hit_rate_all_hits() {
        let metrics = EngineMetrics::new();
        for _ in 0..2 {
            metrics.record_kv_hit();
        }
        let distance_from_one = (metrics.kv_cache_hit_rate() - 1.0).abs();
        assert!(distance_from_one < 1e-9);
    }
}