rustfoundry/telemetry/tokio_runtime_metrics/metrics.rs

use std::sync::Arc;

use crate::telemetry::metrics::{metrics, Counter, Gauge};
4#[metrics(crate_path = "crate")]
5pub(super) mod tokio_runtime_core {
6    /// Number of worker threads in use by the runtime.
7    ///
8    /// This number shouldn't change during execution.
9    pub fn workers(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;
10
11    /// Current number of blocking threads allocated by the runtime.
12    ///
13    /// This should ideally be less than the blocking threads limit, otherwise you may be experiencing
14    /// resource saturation at least some proportion of the time.
15    pub fn blocking_threads(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;
16
17    /// Current number of active tasks on the runtime.
18    pub fn num_alive_tasks(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;
19
20    /// Current number of idle blocking threads on the runtime which aren't doing anything.
21    ///
22    /// This can give a good idea of how much of the thread pool is being utilized.
23    ///
24    /// If this is a very low number relative to the number of allocated blocking threads,
25    /// and we are reaching the limit for blocking thread allocations,
26    /// then we may be experiencing saturation of the thread pool.
27    pub fn idle_blocking_threads(
28        runtime_name: &Option<Arc<str>>,
29        runtime_id: Option<usize>,
30    ) -> Gauge;
31
32    /// Counter of schedules not originating from a worker on the runtime.
33    ///
34    /// Remote schedules tend to be slower than local ones, and occur when a wake or spawn happens
35    /// off of a worker (e.g. on a background thread or in the block_on call).
36    pub fn remote_schedules_total(
37        runtime_name: &Option<Arc<str>>,
38        runtime_id: Option<usize>,
39    ) -> Counter;
40
41    /// Counter of forced yields due to task budgeting.
42    pub fn budget_forced_yields_total(
43        runtime_name: &Option<Arc<str>>,
44        runtime_id: Option<usize>,
45    ) -> Counter;
46
47    /// Counter of file descriptors registered with the IO driver.
48    pub fn io_driver_fd_registrations_total(
49        runtime_name: &Option<Arc<str>>,
50        runtime_id: Option<usize>,
51    ) -> Counter;
52
53    /// Counter of file descriptors deregistered with the IO driver.
54    pub fn io_driver_fd_deregistrations_total(
55        runtime_name: &Option<Arc<str>>,
56        runtime_id: Option<usize>,
57    ) -> Counter;
58
59    /// Counter of readiness events received via the IO driver.
60    pub fn io_driver_fd_readies_total(
61        runtime_name: &Option<Arc<str>>,
62        runtime_id: Option<usize>,
63    ) -> Counter;
64
65    /// Current depth of the tokio runtime global queue.
66    pub fn global_queue_depth(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;
67
68    /// Current depth of the tokio runtime blocking queue.
69    ///
70    /// If this is growing, then we have saturated our blocking pool and either need more threads
71    /// or cgroups cpu time allotment.
72    pub fn blocking_queue_depth(
73        runtime_name: &Option<Arc<str>>,
74        runtime_id: Option<usize>,
75    ) -> Gauge;
76}
77
78#[metrics(crate_path = "crate")]
79pub(super) mod tokio_runtime_worker {
80    /// Total number of times this worker has parked.
81    pub fn parks_total(
82        runtime_name: &Option<Arc<str>>,
83        runtime_id: Option<usize>,
84        worker_idx: usize,
85    ) -> Counter;
86
87    /// Total number of spurious noop parks this worker has experienced.
88    ///
89    /// If this is happening a lot, it might be worth investigating what is happening in tokio and
90    /// potentially your kernel as well.
91    pub fn noops_total(
92        runtime_name: &Option<Arc<str>>,
93        runtime_id: Option<usize>,
94        worker_idx: usize,
95    ) -> Counter;
96
97    /// Total number of tasks stolen due to work-stealing by this worker.
98    pub fn task_steals_total(
99        runtime_name: &Option<Arc<str>>,
100        runtime_id: Option<usize>,
101        worker_idx: usize,
102    ) -> Counter;
103
104    /// Total number of times that this worker has stolen one or more tasks.
105    pub fn steal_operations_total(
106        runtime_name: &Option<Arc<str>>,
107        runtime_id: Option<usize>,
108        worker_idx: usize,
109    ) -> Counter;
110
111    /// Total number of times that this worker has polled a task.
112    pub fn polls_total(
113        runtime_name: &Option<Arc<str>>,
114        runtime_id: Option<usize>,
115        worker_idx: usize,
116    ) -> Counter;
117
118    /// Total amount of time that this worker has been polling tasks.
119    ///
120    /// Ideally, workers should be incrementing this threshold relatively evenly,
121    /// otherwise you are experiencing load balancing issues for some reason.
122    pub fn busy_duration_micros_total(
123        runtime_name: &Option<Arc<str>>,
124        runtime_id: Option<usize>,
125        worker_idx: usize,
126    ) -> Counter;
127
128    /// Total number of local schedules.
129    ///
130    /// Cumulatively, this should generally be high relative to remote schedules.
131    ///
132    /// Otherwise, you are seeing a high proportion of off-runtime wakes, which can be slower.
133    pub fn local_schedules_total(
134        runtime_name: &Option<Arc<str>>,
135        runtime_id: Option<usize>,
136        worker_idx: usize,
137    ) -> Counter;
138
139    /// Total number of times that this worker has overflown its local queue, pushing excess tasks
140    /// to the injector queue.
141    pub fn overflows_total(
142        runtime_name: &Option<Arc<str>>,
143        runtime_id: Option<usize>,
144        worker_idx: usize,
145    ) -> Counter;
146
147    /// Current depth of this worker's local run queue.
148    pub fn local_queue_depth(
149        runtime_name: &Option<Arc<str>>,
150        runtime_id: Option<usize>,
151        worker_idx: usize,
152    ) -> Gauge;
153
154    /// Moving average of task poll times for this worker.
155    pub fn mean_poll_time_micros(
156        runtime_name: &Option<Arc<str>>,
157        runtime_id: Option<usize>,
158        worker_idx: usize,
159    ) -> Gauge;
160}