Skip to main content

oxillama_runtime/
metrics.rs

//! Engine metrics — thread-safe counters for throughput and cache statistics.
2
3use std::sync::atomic::{AtomicU64, Ordering};
4use std::sync::Arc;
5use std::time::Duration;
6
/// Thread-safe metrics counters for an inference engine.
///
/// Every counter is an [`AtomicU64`] that is read and written with
/// [`Ordering::Relaxed`]: each individual update is atomic, but no ordering is
/// established between different counters.
#[derive(Debug, Default)]
pub struct EngineMetrics {
    /// Total tokens generated (decode phase).
    pub tokens_generated: AtomicU64,
    /// Total tokens processed in prefill.
    pub tokens_prefilled: AtomicU64,
    /// Total KV cache hits (prefix/page cache returns a cached slot).
    pub kv_cache_hits: AtomicU64,
    /// Total KV cache misses.
    pub kv_cache_misses: AtomicU64,
    /// Total decode time in nanoseconds.
    pub decode_nanos: AtomicU64,
    /// Total prefill time in nanoseconds.
    pub prefill_nanos: AtomicU64,
    /// Number of requests started.
    pub requests_started: AtomicU64,
    /// Number of requests completed.
    pub requests_completed: AtomicU64,
}

impl EngineMetrics {
    /// Create an `Arc`-wrapped, zero-initialised metrics instance.
    pub fn new() -> Arc<Self> {
        Arc::new(Self::default())
    }

    /// Record a single decode token and the time taken to produce it.
    ///
    /// `elapsed` is converted to whole nanoseconds, saturating at `u64::MAX`
    /// instead of silently truncating. The running total itself is updated
    /// with `fetch_add`, which wraps on overflow.
    pub fn record_decode_token(&self, elapsed: Duration) {
        self.tokens_generated.fetch_add(1, Ordering::Relaxed);
        self.decode_nanos
            .fetch_add(saturating_nanos(elapsed), Ordering::Relaxed);
    }

    /// Record a prefill phase that processed `n_tokens` prompt tokens.
    ///
    /// `elapsed` is converted to whole nanoseconds, saturating at `u64::MAX`
    /// instead of silently truncating.
    pub fn record_prefill(&self, n_tokens: u64, elapsed: Duration) {
        self.tokens_prefilled.fetch_add(n_tokens, Ordering::Relaxed);
        self.prefill_nanos
            .fetch_add(saturating_nanos(elapsed), Ordering::Relaxed);
    }

    /// Record a KV-cache hit (prefix or page reuse).
    pub fn record_kv_hit(&self) {
        self.kv_cache_hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Record a KV-cache miss (full prefill required).
    pub fn record_kv_miss(&self) {
        self.kv_cache_misses.fetch_add(1, Ordering::Relaxed);
    }

    /// Record that a new inference request has started.
    pub fn record_request_start(&self) {
        self.requests_started.fetch_add(1, Ordering::Relaxed);
    }

    /// Record that an inference request has completed.
    pub fn record_request_complete(&self) {
        self.requests_completed.fetch_add(1, Ordering::Relaxed);
    }

    /// Returns `(decode_tokens_per_sec, prefill_tokens_per_sec)`.
    ///
    /// Each rate is the phase's total token count divided by its total
    /// recorded time. Both values are 0.0 if no time has been recorded for
    /// that phase.
    pub fn throughput(&self) -> (f64, f64) {
        let decode_tps = tokens_per_sec(
            self.tokens_generated.load(Ordering::Relaxed),
            self.decode_nanos.load(Ordering::Relaxed),
        );
        let prefill_tps = tokens_per_sec(
            self.tokens_prefilled.load(Ordering::Relaxed),
            self.prefill_nanos.load(Ordering::Relaxed),
        );
        (decode_tps, prefill_tps)
    }

    /// Returns KV cache hit rate in the range `[0.0, 1.0]`.
    ///
    /// Returns 0.0 when no lookups have been recorded.
    pub fn kv_cache_hit_rate(&self) -> f64 {
        hit_ratio(
            self.kv_cache_hits.load(Ordering::Relaxed),
            self.kv_cache_misses.load(Ordering::Relaxed),
        )
    }

    /// Returns a point-in-time snapshot of all counters.
    ///
    /// Every counter is loaded exactly once and the derived rates are computed
    /// from those same loaded values, so the rates in a snapshot always agree
    /// with the raw counts stored next to them. The counters are still loaded
    /// one after another, so updates that land between two loads can make
    /// different counters reflect slightly different moments.
    pub fn snapshot(&self) -> MetricsSnapshot {
        let tokens_generated = self.tokens_generated.load(Ordering::Relaxed);
        let tokens_prefilled = self.tokens_prefilled.load(Ordering::Relaxed);
        let decode_nanos = self.decode_nanos.load(Ordering::Relaxed);
        let prefill_nanos = self.prefill_nanos.load(Ordering::Relaxed);
        let kv_hits = self.kv_cache_hits.load(Ordering::Relaxed);
        let kv_misses = self.kv_cache_misses.load(Ordering::Relaxed);

        MetricsSnapshot {
            tokens_generated,
            tokens_prefilled,
            decode_tokens_per_sec: tokens_per_sec(tokens_generated, decode_nanos),
            prefill_tokens_per_sec: tokens_per_sec(tokens_prefilled, prefill_nanos),
            kv_cache_hit_rate: hit_ratio(kv_hits, kv_misses),
            requests_started: self.requests_started.load(Ordering::Relaxed),
            requests_completed: self.requests_completed.load(Ordering::Relaxed),
        }
    }

    /// Reset all counters to zero.
    ///
    /// The counters are cleared one at a time, so a concurrent reader may
    /// observe some counters already zeroed and others not yet.
    pub fn reset(&self) {
        let counters = [
            &self.tokens_generated,
            &self.tokens_prefilled,
            &self.kv_cache_hits,
            &self.kv_cache_misses,
            &self.decode_nanos,
            &self.prefill_nanos,
            &self.requests_started,
            &self.requests_completed,
        ];
        for counter in counters {
            counter.store(0, Ordering::Relaxed);
        }
    }
}

/// Converts `elapsed` to whole nanoseconds, saturating at `u64::MAX`.
fn saturating_nanos(elapsed: Duration) -> u64 {
    // `as_nanos` yields a `u128`; clamp first so the narrowing cast is lossless.
    elapsed.as_nanos().min(u128::from(u64::MAX)) as u64
}

/// Tokens per second for `tokens` processed in `nanos` nanoseconds.
///
/// Returns 0.0 when `nanos` is zero so callers never divide by zero.
fn tokens_per_sec(tokens: u64, nanos: u64) -> f64 {
    if nanos == 0 {
        0.0
    } else {
        tokens as f64 / (nanos as f64 * 1e-9)
    }
}

/// Fraction of lookups that were hits, or 0.0 when there were no lookups.
fn hit_ratio(hits: u64, misses: u64) -> f64 {
    // Sum in `u128` so two large counters cannot overflow the total.
    let lookups = u128::from(hits) + u128::from(misses);
    if lookups == 0 {
        0.0
    } else {
        hits as f64 / lookups as f64
    }
}

/// A point-in-time clone of [`EngineMetrics`] counters.
#[derive(Debug, Clone)]
pub struct MetricsSnapshot {
    /// Total tokens produced in the decode phase.
    pub tokens_generated: u64,
    /// Total tokens processed in the prefill phase.
    pub tokens_prefilled: u64,
    /// Decode throughput in tokens per second.
    pub decode_tokens_per_sec: f64,
    /// Prefill throughput in tokens per second.
    pub prefill_tokens_per_sec: f64,
    /// KV cache hit rate `[0.0, 1.0]`.
    pub kv_cache_hit_rate: f64,
    /// Number of requests started.
    pub requests_started: u64,
    /// Number of requests completed.
    pub requests_completed: u64,
}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads a counter with the same relaxed ordering the metrics code uses.
    fn read(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    /// Two one-second decode tokens bump both the token count and the timer.
    #[test]
    fn test_record_decode_token_increments() {
        let metrics = EngineMetrics::new();
        for _ in 0..2 {
            metrics.record_decode_token(Duration::from_secs(1));
        }
        assert_eq!(read(&metrics.tokens_generated), 2);
        assert_eq!(read(&metrics.decode_nanos), 2_000_000_000);
    }

    /// Ten tokens at 100 ms each add up to one second, i.e. 10 tok/s.
    #[test]
    fn test_throughput_decode() {
        let metrics = EngineMetrics::new();
        let per_token = Duration::from_millis(100);
        (0..10).for_each(|_| metrics.record_decode_token(per_token));

        let (decode_tps, prefill_tps) = metrics.throughput();
        let error = (decode_tps - 10.0).abs();
        assert!(error < 0.1, "decode_tps={decode_tps}");
        // No prefill was recorded, so its rate stays at the zero default.
        assert_eq!(prefill_tps, 0.0);
    }

    /// A snapshot mirrors the raw counters and the derived hit rate.
    #[test]
    fn test_snapshot_fields() {
        let metrics = EngineMetrics::new();
        metrics.record_prefill(5, Duration::from_millis(50));
        metrics.record_kv_hit();
        metrics.record_kv_miss();
        metrics.record_request_start();
        metrics.record_request_complete();

        let MetricsSnapshot {
            tokens_prefilled,
            requests_started,
            requests_completed,
            kv_cache_hit_rate,
            ..
        } = metrics.snapshot();
        assert_eq!(tokens_prefilled, 5);
        assert_eq!(requests_started, 1);
        assert_eq!(requests_completed, 1);
        // One hit and one miss give a rate of exactly one half.
        assert!((kv_cache_hit_rate - 0.5).abs() < 1e-9);
    }

    /// After `reset`, counts and derived rates all read as zero.
    #[test]
    fn test_reset_clears_all() {
        let metrics = EngineMetrics::new();
        metrics.record_decode_token(Duration::from_millis(10));
        metrics.record_prefill(3, Duration::from_millis(5));
        metrics.record_kv_hit();

        metrics.reset();

        let after = metrics.snapshot();
        assert_eq!(after.tokens_generated, 0);
        assert_eq!(after.tokens_prefilled, 0);
        assert_eq!(after.decode_tokens_per_sec, 0.0);
        assert_eq!(after.kv_cache_hit_rate, 0.0);
    }

    /// With no lookups recorded the hit rate is defined as zero.
    #[test]
    fn test_kv_cache_hit_rate_zero_when_no_lookups() {
        let rate = EngineMetrics::new().kv_cache_hit_rate();
        assert_eq!(rate, 0.0);
    }

    /// Hits without any misses give a hit rate of one.
    #[test]
    fn test_kv_cache_hit_rate_all_hits() {
        let metrics = EngineMetrics::new();
        for _ in 0..2 {
            metrics.record_kv_hit();
        }
        let distance_from_one = (metrics.kv_cache_hit_rate() - 1.0).abs();
        assert!(distance_from_one < 1e-9);
    }
}