Skip to main content

trueno/brick/
perf_metrics.rs

//! Performance Metrics Breakdown
//!
//! LCP-04 pattern from llama.cpp for tracking inference timing.

// ----------------------------------------------------------------------------
// LCP-04: Performance Metrics Breakdown (llama.cpp pattern)
// ----------------------------------------------------------------------------

/// Performance metrics breakdown for inference phases.
///
/// Tracks timing for each phase of LLM inference:
/// - Model loading (`t_load_ms`)
/// - Prompt evaluation / prefill (`t_p_eval_ms`)
/// - Token generation / decode (`t_eval_ms`)
///
/// Load and prefill values are set once (overwritten on re-record), while
/// decode time and token counts accumulate across `record_decode*` calls.
///
/// # Example
/// ```rust,ignore
/// use trueno::brick::PerfMetrics;
///
/// let mut metrics = PerfMetrics::default();
/// metrics.record_load(1500);  // 1.5s model load
/// metrics.record_prefill(200, 512);  // 200ms for 512 prompt tokens
/// metrics.record_decode(50);  // 50ms per generated token
///
/// println!("{}", metrics.summary());
/// ```
#[derive(Debug, Clone, Default)]
pub struct PerfMetrics {
    /// Model loading time (milliseconds).
    pub t_load_ms: u64,
    /// Prompt evaluation time - prefill phase (milliseconds).
    pub t_p_eval_ms: u64,
    /// Token generation time - decode phase (milliseconds); accumulated.
    pub t_eval_ms: u64,
    /// Number of tokens in prompt (prefill).
    pub n_p_eval: u32,
    /// Number of tokens generated (decode); accumulated.
    pub n_eval: u32,
    /// Sample count for t_eval (for averaging): incremented once per
    /// `record_decode`/`record_decode_batch` call, not once per token.
    pub n_samples: u32,
}
42
43impl PerfMetrics {
44    /// Create new metrics instance.
45    pub fn new() -> Self {
46        Self::default()
47    }
48
49    /// Record model loading time.
50    pub fn record_load(&mut self, ms: u64) {
51        self.t_load_ms = ms;
52    }
53
54    /// Record prefill (prompt evaluation) time.
55    pub fn record_prefill(&mut self, ms: u64, tokens: u32) {
56        self.t_p_eval_ms = ms;
57        self.n_p_eval = tokens;
58    }
59
60    /// Record a single decode step.
61    pub fn record_decode(&mut self, ms: u64) {
62        self.t_eval_ms += ms;
63        self.n_eval += 1;
64        self.n_samples += 1;
65    }
66
67    /// Record batch decode step.
68    pub fn record_decode_batch(&mut self, ms: u64, tokens: u32) {
69        self.t_eval_ms += ms;
70        self.n_eval += tokens;
71        self.n_samples += 1;
72    }
73
74    /// Tokens per second during generation (decode throughput).
75    #[must_use]
76    pub fn tokens_per_second(&self) -> f64 {
77        if self.t_eval_ms == 0 {
78            0.0
79        } else {
80            1000.0 * self.n_eval as f64 / self.t_eval_ms as f64
81        }
82    }
83
84    /// Tokens per second during prompt evaluation (prefill throughput).
85    #[must_use]
86    pub fn prefill_tokens_per_second(&self) -> f64 {
87        if self.t_p_eval_ms == 0 {
88            0.0
89        } else {
90            1000.0 * self.n_p_eval as f64 / self.t_p_eval_ms as f64
91        }
92    }
93
94    /// Total time for complete inference.
95    #[must_use]
96    pub fn total_ms(&self) -> u64 {
97        self.t_load_ms + self.t_p_eval_ms + self.t_eval_ms
98    }
99
100    /// Time-to-first-token (TTFT).
101    #[must_use]
102    pub fn time_to_first_token_ms(&self) -> u64 {
103        self.t_load_ms + self.t_p_eval_ms
104    }
105
106    /// Average time per token during decode.
107    #[must_use]
108    pub fn avg_token_latency_ms(&self) -> f64 {
109        if self.n_eval == 0 {
110            0.0
111        } else {
112            self.t_eval_ms as f64 / self.n_eval as f64
113        }
114    }
115
116    /// Formatted summary string.
117    #[must_use]
118    pub fn summary(&self) -> String {
119        format!(
120            "load: {}ms, prefill: {}ms ({:.1} tok/s, {} tokens), decode: {}ms ({:.1} tok/s, {} tokens), total: {}ms",
121            self.t_load_ms,
122            self.t_p_eval_ms,
123            self.prefill_tokens_per_second(),
124            self.n_p_eval,
125            self.t_eval_ms,
126            self.tokens_per_second(),
127            self.n_eval,
128            self.total_ms()
129        )
130    }
131
132    /// Reset all metrics.
133    pub fn reset(&mut self) {
134        *self = Self::default();
135    }
136}
137
// ----------------------------------------------------------------------------
// LCP-01: Inference Phase (for Arena Allocation)
// ----------------------------------------------------------------------------
141
/// Inference phase for dual-arena allocation (LCP-01 pattern).
///
/// Defaults to [`InferencePhase::Prefill`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum InferencePhase {
    /// Processing the prompt, large batches.
    #[default]
    Prefill,
    /// Generating tokens, small batches.
    Decode,
}
151
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_perf_metrics_default() {
        let m = PerfMetrics::default();
        // Every timer and counter starts zeroed.
        let snapshot = (m.t_load_ms, m.t_p_eval_ms, m.t_eval_ms, m.n_p_eval, m.n_eval);
        assert_eq!(snapshot, (0, 0, 0, 0, 0));
    }

    #[test]
    fn test_perf_metrics_record_load() {
        let mut m = PerfMetrics::new();
        m.record_load(1500);
        assert_eq!(m.t_load_ms, 1500);
    }

    #[test]
    fn test_perf_metrics_record_prefill() {
        let mut m = PerfMetrics::new();
        m.record_prefill(200, 512);
        assert_eq!((m.t_p_eval_ms, m.n_p_eval), (200, 512));
    }

    #[test]
    fn test_perf_metrics_record_decode() {
        let mut m = PerfMetrics::new();
        // Two single-token steps of 50ms each.
        for _ in 0..2 {
            m.record_decode(50);
        }
        assert_eq!(m.t_eval_ms, 100);
        assert_eq!(m.n_eval, 2);
        assert_eq!(m.n_samples, 2);
    }

    #[test]
    fn test_perf_metrics_tokens_per_second() {
        let mut m = PerfMetrics::new();
        // 100 tokens in one second => 100 tok/s.
        m.record_decode_batch(1000, 100);
        assert!((m.tokens_per_second() - 100.0).abs() < 1e-3);
    }

    #[test]
    fn test_perf_metrics_prefill_throughput() {
        let mut m = PerfMetrics::new();
        // 1000 tokens in 500ms => 2000 tok/s.
        m.record_prefill(500, 1000);
        assert!((m.prefill_tokens_per_second() - 2000.0).abs() < 1e-3);
    }

    #[test]
    fn test_perf_metrics_total_ms() {
        let mut m = PerfMetrics::new();
        m.record_load(1000);
        m.record_prefill(200, 512);
        m.record_decode_batch(300, 100);
        assert_eq!(m.total_ms(), 1000 + 200 + 300);
    }

    #[test]
    fn test_perf_metrics_time_to_first_token() {
        let mut m = PerfMetrics::new();
        m.record_load(1000);
        m.record_prefill(200, 512);
        // TTFT excludes decode time: load + prefill only.
        assert_eq!(m.time_to_first_token_ms(), 1200);
    }

    #[test]
    fn test_perf_metrics_reset() {
        let mut m = PerfMetrics::new();
        m.record_load(1500);
        m.record_prefill(200, 512);
        m.reset();
        assert_eq!(m.t_load_ms, 0);
        assert_eq!(m.n_p_eval, 0);
    }

    #[test]
    fn test_inference_phase_default() {
        assert_eq!(InferencePhase::default(), InferencePhase::Prefill);
    }

    #[test]
    fn test_inference_phase_eq() {
        assert_eq!(InferencePhase::Prefill, InferencePhase::Prefill);
        assert_eq!(InferencePhase::Decode, InferencePhase::Decode);
        assert_ne!(InferencePhase::Prefill, InferencePhase::Decode);
    }
}