Skip to main content

dynamo_runtime/metrics/
frontend_perf.rs

1// SPDX-FileCopyrightText: Copyright (c) 2026-2027 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Frontend pipeline stage and finer-grained perf metrics.
5//! Used by both runtime (route, transport_roundtrip) and llm (preprocess, postprocess, tokenize, template, detokenize).
6
7use once_cell::sync::{Lazy, OnceCell};
8use prometheus::{Counter, Histogram, HistogramOpts, HistogramVec, IntGaugeVec, Opts, Registry};
9
10use super::prometheus_names::{frontend_perf, name_prefix};
11use crate::MetricsRegistry;
12
13pub use super::prometheus_names::frontend_perf::{STAGE_DISPATCH, STAGE_PREPROCESS, STAGE_ROUTE};
14
15fn frontend_metric_name(suffix: &str) -> String {
16    format!("{}_{}", name_prefix::FRONTEND, suffix)
17}
18
19/// Per-stage inflight request count: preprocess, route, dispatch.
20/// Labels: stage (pipeline stage), phase (prefill/decode/aggregated or empty for preprocess).
21pub static STAGE_REQUESTS: Lazy<IntGaugeVec> = Lazy::new(|| {
22    IntGaugeVec::new(
23        Opts::new(
24            frontend_metric_name(frontend_perf::STAGE_REQUESTS),
25            "Number of requests currently in the given pipeline stage",
26        ),
27        &["stage", "phase"],
28    )
29    .expect("failed to create dynamo_frontend_stage_requests gauge")
30});
31
32/// RAII guard that increments a per-stage gauge on creation and decrements on drop.
33///
34/// Used to track how many requests are in each frontend pipeline stage at any given time.
35/// Create with [`StageGuard::new`] at stage entry; the gauge decrements automatically when
36/// the guard is dropped (end of scope, explicit drop, or stream completion).
37pub struct StageGuard {
38    gauge: prometheus::IntGauge,
39}
40
41impl StageGuard {
42    /// Increment the stage gauge and return a guard that decrements on drop.
43    ///
44    /// * `stage` — pipeline stage name; use `frontend_perf::STAGE_{PREPROCESS,ROUTE,DISPATCH}`
45    ///   constants from [`crate::metrics::prometheus_names`].
46    /// * `phase` — request phase; use [`RequestPhase::to_string`] output
47    ///   (`"prefill"|"decode"|"aggregated"`), or `""` for stages without a phase.
48    pub fn new(stage: &str, phase: &str) -> Self {
49        let gauge = STAGE_REQUESTS.with_label_values(&[stage, phase]);
50        gauge.inc();
51        Self { gauge }
52    }
53}
54
55impl Drop for StageGuard {
56    fn drop(&mut self) {
57        self.gauge.dec();
58    }
59}
60
61/// Per-stage latency: preprocess, route, transport_roundtrip, postprocess.
62pub static STAGE_DURATION_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
63    HistogramVec::new(
64        HistogramOpts::new(
65            frontend_metric_name(frontend_perf::STAGE_DURATION_SECONDS),
66            "Pipeline stage duration (seconds)",
67        )
68        .buckets(vec![
69            0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0,
70        ]),
71        &["stage"],
72    )
73    .expect("stage_duration_seconds histogram vec")
74});
75
76/// Tokenization time in preprocessor (gather_tokens).
77pub static TOKENIZE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
78    Histogram::with_opts(
79        HistogramOpts::new(
80            frontend_metric_name(frontend_perf::TOKENIZE_SECONDS),
81            "Tokenization time in preprocessor (seconds)",
82        )
83        .buckets(vec![
84            0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0,
85        ]),
86    )
87    .expect("tokenize_seconds histogram")
88});
89
90/// Template application time in preprocessor (apply_template).
91pub static TEMPLATE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
92    Histogram::with_opts(
93        HistogramOpts::new(
94            frontend_metric_name(frontend_perf::TEMPLATE_SECONDS),
95            "Template application time in preprocessor (seconds)",
96        )
97        .buckets(vec![
98            0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05,
99        ]),
100    )
101    .expect("template_seconds histogram")
102});
103
104/// Cumulative detokenization time across all tokens (microseconds).
105/// Use `rate(total) / rate(count)` in Prometheus to derive per-token average.
106pub static DETOKENIZE_TOTAL_US: Lazy<Counter> = Lazy::new(|| {
107    Counter::with_opts(Opts::new(
108        frontend_metric_name(frontend_perf::DETOKENIZE_TOTAL_US),
109        "Cumulative detokenization time (microseconds)",
110    ))
111    .expect("detokenize_total_us counter")
112});
113
114/// Total number of tokens detokenized.
115pub static DETOKENIZE_TOKEN_COUNT: Lazy<Counter> = Lazy::new(|| {
116    Counter::with_opts(Opts::new(
117        frontend_metric_name(frontend_perf::DETOKENIZE_TOKEN_COUNT),
118        "Total tokens detokenized",
119    ))
120    .expect("detokenize_token_count counter")
121});
122
123/// Cumulative L1 tokenizer cache hits. Only nonzero when `DYN_TOKENIZER_CACHE=1`.
124pub static TOKENIZER_CACHE_HITS_TOTAL: Lazy<Counter> = Lazy::new(|| {
125    Counter::with_opts(Opts::new(
126        frontend_metric_name(frontend_perf::TOKENIZER_CACHE_HITS_TOTAL),
127        "Cumulative L1 tokenizer prefix-cache hits",
128    ))
129    .expect("tokenizer_cache_hits_total counter")
130});
131
132/// Cumulative L1 tokenizer cache misses. Only nonzero when `DYN_TOKENIZER_CACHE=1`.
133pub static TOKENIZER_CACHE_MISSES_TOTAL: Lazy<Counter> = Lazy::new(|| {
134    Counter::with_opts(Opts::new(
135        frontend_metric_name(frontend_perf::TOKENIZER_CACHE_MISSES_TOTAL),
136        "Cumulative L1 tokenizer prefix-cache misses",
137    ))
138    .expect("tokenizer_cache_misses_total counter")
139});
140
141/// Guards idempotency for the `MetricsRegistry` registration path.
142static REGISTERED: OnceCell<()> = OnceCell::new();
143
144/// Guards idempotency for the raw `prometheus::Registry` registration path.
145/// Kept separate from `REGISTERED` so that calling `ensure_frontend_perf_metrics_registered`
146/// first does not silently prevent the metrics from being registered in the prometheus registry.
147static PROMETHEUS_REGISTERED: OnceCell<()> = OnceCell::new();
148
149/// Register frontend perf metrics with the given registry. Idempotent.
150pub fn ensure_frontend_perf_metrics_registered(registry: &MetricsRegistry) {
151    let _ = REGISTERED.get_or_init(|| {
152        registry.add_metric(Box::new(STAGE_REQUESTS.clone())).ok();
153        registry
154            .add_metric(Box::new(STAGE_DURATION_SECONDS.clone()))
155            .ok();
156        registry.add_metric(Box::new(TOKENIZE_SECONDS.clone())).ok();
157        registry.add_metric(Box::new(TEMPLATE_SECONDS.clone())).ok();
158        registry
159            .add_metric(Box::new(DETOKENIZE_TOTAL_US.clone()))
160            .ok();
161        registry
162            .add_metric(Box::new(DETOKENIZE_TOKEN_COUNT.clone()))
163            .ok();
164        registry
165            .add_metric(Box::new(TOKENIZER_CACHE_HITS_TOTAL.clone()))
166            .ok();
167        registry
168            .add_metric(Box::new(TOKENIZER_CACHE_MISSES_TOTAL.clone()))
169            .ok();
170    });
171}
172
173/// Register frontend perf metrics with a raw Prometheus registry (e.g. for LLM HTTP service /metrics).
174/// Idempotent. Call this when the service exposes /metrics from its own registry.
175pub fn ensure_frontend_perf_metrics_registered_prometheus(
176    registry: &Registry,
177) -> Result<(), prometheus::Error> {
178    if PROMETHEUS_REGISTERED.get().is_some() {
179        return Ok(());
180    }
181    registry.register(Box::new(STAGE_REQUESTS.clone()))?;
182    registry.register(Box::new(STAGE_DURATION_SECONDS.clone()))?;
183    registry.register(Box::new(TOKENIZE_SECONDS.clone()))?;
184    registry.register(Box::new(TEMPLATE_SECONDS.clone()))?;
185    registry.register(Box::new(DETOKENIZE_TOTAL_US.clone()))?;
186    registry.register(Box::new(DETOKENIZE_TOKEN_COUNT.clone()))?;
187    registry.register(Box::new(TOKENIZER_CACHE_HITS_TOTAL.clone()))?;
188    registry.register(Box::new(TOKENIZER_CACHE_MISSES_TOTAL.clone()))?;
189    let _ = PROMETHEUS_REGISTERED.set(());
190    Ok(())
191}
192
#[cfg(test)]
mod tests {
    use super::*;

    /// Each live guard adds one to its gauge; each dropped guard takes one away.
    #[test]
    fn test_stage_guard_inc_dec() {
        let observed = STAGE_REQUESTS.with_label_values(&["test_stage", "test_phase"]);
        assert_eq!(observed.get(), 0);

        let outer = StageGuard::new("test_stage", "test_phase");
        assert_eq!(observed.get(), 1);

        let inner = StageGuard::new("test_stage", "test_phase");
        assert_eq!(observed.get(), 2);

        drop(inner);
        assert_eq!(observed.get(), 1);

        drop(outer);
        assert_eq!(observed.get(), 0);
    }

    /// Guards with different label pairs move independent gauges.
    #[test]
    fn test_stage_guard_different_labels() {
        let preprocess = STAGE_REQUESTS.with_label_values(&["preprocess_t", ""]);
        let route_prefill = STAGE_REQUESTS.with_label_values(&["route_t", "prefill"]);
        let route_decode = STAGE_REQUESTS.with_label_values(&["route_t", "decode"]);

        // The underscore-prefixed guards are held until the end of the test.
        let _preprocess_guard = StageGuard::new("preprocess_t", "");
        let prefill_guard = StageGuard::new("route_t", "prefill");
        let _decode_guard = StageGuard::new("route_t", "decode");

        assert_eq!(preprocess.get(), 1);
        assert_eq!(route_prefill.get(), 1);
        assert_eq!(route_decode.get(), 1);

        // Dropping one guard must only affect its own label pair.
        drop(prefill_guard);
        assert_eq!(preprocess.get(), 1);
        assert_eq!(route_prefill.get(), 0);
        assert_eq!(route_decode.get(), 1);
    }
}