rustfoundry/telemetry/tokio_runtime_metrics/metrics.rs
use crate::telemetry::metrics::{metrics, Counter, Gauge};
use std::sync::Arc;

#[metrics(crate_path = "crate")]
pub(super) mod tokio_runtime_core {
    /// Number of worker threads in use by the runtime.
    ///
    /// This number shouldn't change during execution.
    pub fn workers(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;

    /// Current number of blocking threads allocated by the runtime.
    ///
    /// Ideally this stays below the blocking thread limit; if it sits at the limit,
    /// the blocking pool is saturated at least some of the time.
    pub fn blocking_threads(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;

    /// Current number of alive tasks on the runtime.
    pub fn num_alive_tasks(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;

    /// Current number of idle blocking threads on the runtime.
    ///
    /// This gives a good idea of how much of the blocking thread pool is being utilized.
    ///
    /// If this is very low relative to the number of allocated blocking threads,
    /// and we are reaching the limit for blocking thread allocations,
    /// then we may be experiencing saturation of the thread pool.
    pub fn idle_blocking_threads(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
    ) -> Gauge;

    /// Counter of schedules not originating from a worker on the runtime.
    ///
    /// Remote schedules tend to be slower than local ones, and occur when a wake or spawn happens
    /// off a worker thread (e.g. on a background thread or inside a `block_on` call).
    pub fn remote_schedules_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
    ) -> Counter;

    /// Counter of forced yields due to task budgeting.
    pub fn budget_forced_yields_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
    ) -> Counter;

    /// Counter of file descriptors registered with the IO driver.
    pub fn io_driver_fd_registrations_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
    ) -> Counter;

    /// Counter of file descriptors deregistered from the IO driver.
    pub fn io_driver_fd_deregistrations_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
    ) -> Counter;

    /// Counter of readiness events received via the IO driver.
    pub fn io_driver_fd_readies_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
    ) -> Counter;

    /// Current depth of the tokio runtime global queue.
    pub fn global_queue_depth(runtime_name: &Option<Arc<str>>, runtime_id: Option<usize>) -> Gauge;

    /// Current depth of the tokio runtime blocking queue.
    ///
    /// If this is growing, the blocking pool is saturated and we either need more blocking
    /// threads or a larger cgroups CPU time allotment.
    pub fn blocking_queue_depth(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
    ) -> Gauge;
}
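
// A minimal, hypothetical sketch (not part of the original file) of how the gauges
// declared above might be fed from tokio's `RuntimeMetrics` by a periodic sampler.
// It assumes the `#[metrics]` macro expands each declaration into a function with the
// declared signature whose returned `Gauge` handle exposes a `set(u64)` method, and
// that a `tokio` dependency is available. Accessors such as `num_blocking_threads`,
// `num_idle_blocking_threads`, and `blocking_queue_depth` additionally require the
// `tokio_unstable` cfg, so only stable accessors are shown here.
#[allow(dead_code)]
fn sample_core_gauges(
    handle: &tokio::runtime::Handle,
    runtime_name: &Option<Arc<str>>,
    runtime_id: Option<usize>,
) {
    let rt_metrics = handle.metrics();

    // Gauges reflect the current state, so they can simply be overwritten on every tick.
    tokio_runtime_core::workers(runtime_name, runtime_id).set(rt_metrics.num_workers() as u64);
    tokio_runtime_core::num_alive_tasks(runtime_name, runtime_id)
        .set(rt_metrics.num_alive_tasks() as u64);
    tokio_runtime_core::global_queue_depth(runtime_name, runtime_id)
        .set(rt_metrics.global_queue_depth() as u64);

    // The `*_total` counters above are cumulative in tokio as well, so a real collector
    // would report the delta since the previous sample rather than overwriting them.
}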

#[metrics(crate_path = "crate")]
pub(super) mod tokio_runtime_worker {
    /// Total number of times this worker has parked.
    pub fn parks_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Total number of spurious no-op parks this worker has experienced,
    /// i.e. times it unparked and went back to sleep without doing any work.
    ///
    /// If this is happening a lot, it is worth investigating what is happening in tokio and
    /// potentially in your kernel as well.
    pub fn noops_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Total number of tasks stolen by this worker via work-stealing.
    pub fn task_steals_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Total number of times that this worker has stolen one or more tasks.
    pub fn steal_operations_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Total number of times that this worker has polled a task.
    pub fn polls_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Total amount of time that this worker has spent polling tasks, in microseconds.
    ///
    /// Ideally, workers should accumulate busy time at roughly the same rate;
    /// if they don't, load is not being balanced evenly across workers.
    pub fn busy_duration_micros_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Total number of local schedules performed by this worker.
    ///
    /// Cumulatively, this should generally be high relative to remote schedules;
    /// otherwise a large proportion of wakes happen off the runtime, which can be slower.
    pub fn local_schedules_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Total number of times that this worker has overflowed its local queue, pushing excess
    /// tasks to the injector (global) queue.
    pub fn overflows_total(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Counter;

    /// Current depth of this worker's local run queue.
    pub fn local_queue_depth(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Gauge;

    /// Moving average of task poll times for this worker, in microseconds.
    pub fn mean_poll_time_micros(
        runtime_name: &Option<Arc<str>>,
        runtime_id: Option<usize>,
        worker_idx: usize,
    ) -> Gauge;
}
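
// A similar hypothetical sketch for the per-worker gauges above. The per-worker
// accessors used here (`worker_local_queue_depth`, `worker_mean_poll_time`) are part
// of tokio's unstable metrics API and require the `tokio_unstable` cfg. The per-worker
// counters (parks, steals, polls, busy time, ...) are cumulative in tokio, so a
// collector would track deltas between samples instead of setting them directly.
#[allow(dead_code)]
#[cfg(tokio_unstable)]
fn sample_worker_gauges(
    handle: &tokio::runtime::Handle,
    runtime_name: &Option<Arc<str>>,
    runtime_id: Option<usize>,
) {
    let rt_metrics = handle.metrics();

    // One labeled time series per worker thread.
    for worker_idx in 0..rt_metrics.num_workers() {
        tokio_runtime_worker::local_queue_depth(runtime_name, runtime_id, worker_idx)
            .set(rt_metrics.worker_local_queue_depth(worker_idx) as u64);

        tokio_runtime_worker::mean_poll_time_micros(runtime_name, runtime_id, worker_idx)
            .set(rt_metrics.worker_mean_poll_time(worker_idx).as_micros() as u64);
    }
}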