Skip to main content

dynamo_runtime/metrics/
work_handler_perf.rs

1// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Transport breakdown metrics for work handler (backend side).
5//! Captures network transit (T2-T1) and backend processing time (T3-T2).
6
7use once_cell::sync::{Lazy, OnceCell};
8use prometheus::{Histogram, HistogramOpts};
9
10use super::prometheus_names::{name_prefix, work_handler};
11use crate::MetricsRegistry;
12
13fn work_handler_metric_name(suffix: &str) -> String {
14    format!("{}_{}", name_prefix::WORK_HANDLER, suffix)
15}
16
17/// Network transit: frontend send to backend receive (wall-clock, cross-process).
18pub static WORK_HANDLER_NETWORK_TRANSIT_SECONDS: Lazy<Histogram> = Lazy::new(|| {
19    Histogram::with_opts(
20        HistogramOpts::new(
21            work_handler_metric_name(work_handler::NETWORK_TRANSIT_SECONDS),
22            "Frontend-to-backend network transit time (cross-process wall-clock, seconds)",
23        )
24        .buckets(vec![
25            0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0,
26        ]),
27    )
28    .expect("work_handler_network_transit_seconds histogram")
29});
30
31/// Backend processing: handle_payload entry to first response sent.
32pub static WORK_HANDLER_TIME_TO_FIRST_RESPONSE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
33    Histogram::with_opts(
34        HistogramOpts::new(
35            work_handler_metric_name(work_handler::TIME_TO_FIRST_RESPONSE_SECONDS),
36            "Backend processing time from handle_payload entry to prologue sent (seconds)",
37        )
38        .buckets(vec![
39            0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0,
40        ]),
41    )
42    .expect("work_handler_time_to_first_response_seconds histogram")
43});
44
45/// Guards idempotency for the `MetricsRegistry` registration path.
46static METRICS_REGISTERED: OnceCell<()> = OnceCell::new();
47
48/// Guards idempotency for the raw `prometheus::Registry` registration path.
49/// Kept separate from `METRICS_REGISTERED` so that calling `ensure_work_handler_perf_metrics_registered`
50/// first does not silently prevent the metrics from being registered in the prometheus registry.
51static PROMETHEUS_REGISTERED: OnceCell<Result<(), String>> = OnceCell::new();
52
53/// Register work handler transport breakdown metrics with the given registry. Idempotent.
54pub fn ensure_work_handler_perf_metrics_registered(registry: &MetricsRegistry) {
55    let _ = METRICS_REGISTERED.get_or_init(|| {
56        registry.add_metric_or_warn(
57            Box::new(WORK_HANDLER_NETWORK_TRANSIT_SECONDS.clone()),
58            "work_handler_network_transit_seconds",
59        );
60        registry.add_metric_or_warn(
61            Box::new(WORK_HANDLER_TIME_TO_FIRST_RESPONSE_SECONDS.clone()),
62            "work_handler_time_to_first_response_seconds",
63        );
64    });
65}
66
67/// Register with a raw Prometheus registry. Idempotent.
68pub fn ensure_work_handler_perf_metrics_registered_prometheus(
69    registry: &prometheus::Registry,
70) -> Result<(), prometheus::Error> {
71    PROMETHEUS_REGISTERED
72        .get_or_init(|| {
73            (|| -> Result<(), prometheus::Error> {
74                registry.register(Box::new(WORK_HANDLER_NETWORK_TRANSIT_SECONDS.clone()))?;
75                registry.register(Box::new(
76                    WORK_HANDLER_TIME_TO_FIRST_RESPONSE_SECONDS.clone(),
77                ))?;
78                Ok(())
79            })()
80            .map_err(|e| e.to_string())
81        })
82        .as_ref()
83        .map(|_| ())
84        .map_err(|e| prometheus::Error::Msg(e.clone()))
85}