switchboard_node_metrics/
lib.rs

1use prometheus::{opts, CounterVec, GaugeVec, HistogramVec, Opts, Registry};
2use std::sync::OnceLock;
3
4// A static variable representing the different node metrics to collect
5pub static SWITCHBOARD_METRICS: OnceLock<SwitchboardMetrics> = OnceLock::new();
6
7#[derive(Debug, Clone)]
8pub struct SwitchboardMetrics {
9    pub registry: Registry,
10    pub fn_backoff_counter: CounterVec,
11    pub fn_execution_stolen_counter: CounterVec,
12    pub request_counter: CounterVec,
13    pub boot_counter: CounterVec,
14    pub network_call_gauge: GaugeVec,
15    pub runtime_gauge: GaugeVec,
16    pub fn_error_code_gauge: GaugeVec,
17    pub unhandled_error_counter: CounterVec,
18    pub fn_timeout_counter: CounterVec,
19    pub oracle_available_permits_gauge: GaugeVec,
20    pub qvn_error_report_failed_counter: CounterVec,
21    pub oracle_img_dl_counter: CounterVec,
22    pub oracle_dl_routine_latency: GaugeVec,
23    pub oracle_awaiter_routine_latency: GaugeVec,
24    pub oracle_poller_latency: GaugeVec,
25    pub function_execution_histogram: HistogramVec,
26}
27
28impl SwitchboardMetrics {
29    pub fn get_or_init() -> &'static Self {
30        SWITCHBOARD_METRICS.get_or_init(SwitchboardMetrics::initialize)
31    }
32
33    pub fn initialize() -> Self {
34        let registry = Registry::new();
35
36        let fn_backoff_counter = CounterVec::new(
37            Opts::new(
38                "switchboard_function_backoff_counter",
39                "Function backoff counter",
40            ),
41            &["chain", "chain_id", "queue_key", "oracle_key"],
42        )
43        .unwrap();
44        prometheus::register(Box::new(fn_backoff_counter.clone())).unwrap();
45
46        let fn_execution_stolen_counter = CounterVec::new(
47            Opts::new(
48                "switchboard_function_stolen_execution_counter",
49                "Function execution stolen counter",
50            ),
51            &[
52                "chain",
53                "chain_id",
54                "queue_key",
55                "oracle_key",
56                "victim_oracle_key",
57                "function_key",
58            ],
59        )
60        .unwrap();
61        prometheus::register(Box::new(fn_execution_stolen_counter.clone())).unwrap();
62
63        let request_counter = CounterVec::new(
64            Opts::new(
65                "switchboard_function_request_counter",
66                "Function TCP request counter",
67            ),
68            &["chain", "chain_id", "queue_key", "oracle_key", "img_name"],
69        )
70        .unwrap();
71        prometheus::register(Box::new(request_counter.clone())).unwrap();
72
73        let boot_counter = CounterVec::new(
74            Opts::new(
75                "switchboard_function_manager_boot_counter",
76                "DIND Boot counter",
77            ),
78            &["chain", "chain_id", "queue_key", "oracle_key"],
79        )
80        .unwrap();
81        prometheus::register(Box::new(boot_counter.clone())).unwrap();
82
83        let network_call_gauge = GaugeVec::new(
84            Opts::new(
85                "switchboard_function_network_call_gauge",
86                "Network Call Counter",
87            ),
88            &["chain", "chain_id", "queue_key", "oracle_key", "function"],
89        )
90        .unwrap();
91        prometheus::register(Box::new(network_call_gauge.clone())).unwrap();
92
93        let runtime_gauge = GaugeVec::new(
94            Opts::new(
95                "switchboard_function_runtime_gauge",
96                "Function Runtime Gauge",
97            ),
98            &[
99                "chain",
100                "chain_id",
101                "queue_key",
102                "oracle_key",
103                "function_key",
104                "function_request_key",
105            ],
106        )
107        .unwrap();
108        prometheus::register(Box::new(runtime_gauge.clone())).unwrap();
109
110        let fn_error_code_gauge = GaugeVec::new(
111            Opts::new(
112                "switchboard_function_error_code_gauge",
113                "Function Runtime Gauge",
114            ),
115            &[
116                "chain",
117                "chain_id",
118                "queue_key",
119                "oracle_key",
120                "function_key",
121                "function_request_key",
122                "code",
123            ],
124        )
125        .unwrap();
126        prometheus::register(Box::new(fn_error_code_gauge.clone())).unwrap();
127
128        let unhandled_error_counter = CounterVec::new(
129            Opts::new(
130                "switchboard_function_unhandled_error_counter",
131                "Function unhandled error counter",
132            ),
133            &[
134                "chain",
135                "chain_id",
136                "queue_key",
137                "oracle_key",
138                "function_key",
139                "function_request_key",
140            ],
141        )
142        .unwrap();
143        prometheus::register(Box::new(unhandled_error_counter.clone())).unwrap();
144
145        let fn_timeout_counter = CounterVec::new(
146            Opts::new(
147                "switchboard_function_timeout_counter",
148                "Function run timeout counter",
149            ),
150            &[
151                "chain",
152                "chain_id",
153                "queue_key",
154                "oracle_key",
155                "function_key",
156            ],
157        )
158        .unwrap();
159        prometheus::register(Box::new(fn_timeout_counter.clone())).unwrap();
160
161        let oracle_available_permits_gauge = GaugeVec::new(
162            Opts::new("switchboard_oracle_available_permits_gauge", "ph"),
163            &["chain", "chain_id", "queue_key", "oracle_key"],
164        )
165        .unwrap();
166        prometheus::register(Box::new(oracle_available_permits_gauge.clone())).unwrap();
167
168        let qvn_error_report_failed_counter = CounterVec::new(
169            Opts::new("switchboard_qvn_error_report_failed_counter", "ph"),
170            &[
171                "chain",
172                "chain_id",
173                "queue_key",
174                "oracle_key",
175                "function_key",
176                "function_request_key",
177            ],
178        )
179        .unwrap();
180        prometheus::register(Box::new(qvn_error_report_failed_counter.clone())).unwrap();
181
182        let oracle_img_dl_counter = CounterVec::new(
183            Opts::new("switchboard_img_dl_counter", "ph"),
184            &["chain", "chain_id", "queue_key", "oracle_key", "container"],
185        )
186        .unwrap();
187        prometheus::register(Box::new(oracle_img_dl_counter.clone())).unwrap();
188
189        let oracle_dl_routine_latency = GaugeVec::new(
190            Opts::new("switchboard_oracle_dl_routine_latency", "ph"),
191            &["chain", "chain_id", "queue_key", "oracle_key"],
192        )
193        .unwrap();
194        prometheus::register(Box::new(oracle_dl_routine_latency.clone())).unwrap();
195
196        let oracle_awaiter_routine_latency = GaugeVec::new(
197            Opts::new("switchboard_oracle_awaiter_routine_latency", "ph"),
198            &["chain", "chain_id", "queue_key", "oracle_key"],
199        )
200        .unwrap();
201        prometheus::register(Box::new(oracle_awaiter_routine_latency.clone())).unwrap();
202
203        let oracle_poller_latency = GaugeVec::new(
204            Opts::new("switchboard_oracle_poller_routine_latency", "ph"),
205            &["chain", "chain_id", "queue_key", "oracle_key"],
206        )
207        .unwrap();
208        prometheus::register(Box::new(oracle_poller_latency.clone())).unwrap();
209
210        let function_execution_histogram = HistogramVec::new(
211            opts!(
212                "function_execution_duration",
213                "Function execution duration in seconds"
214            )
215            .into(),
216            &["task_id"],
217        )
218        .expect("Failed to create function_execution_duration histogram");
219        prometheus::register(Box::new(function_execution_histogram.clone())).unwrap();
220
221        SwitchboardMetrics {
222            registry,
223            fn_backoff_counter,
224            fn_execution_stolen_counter,
225            request_counter,
226            boot_counter,
227            network_call_gauge,
228            runtime_gauge,
229            fn_error_code_gauge,
230            unhandled_error_counter,
231            fn_timeout_counter,
232            oracle_available_permits_gauge,
233            qvn_error_report_failed_counter,
234            oracle_img_dl_counter,
235            oracle_dl_routine_latency,
236            oracle_awaiter_routine_latency,
237            oracle_poller_latency,
238            function_execution_histogram,
239        }
240    }
241
242    pub fn record_fn_execution_duration(&self, task_id: &str, duration_secs: f64) {
243        self.function_execution_histogram
244            .with_label_values(&[task_id])
245            .observe(duration_secs);
246    }
247
248    // TODO: add set_max methods for gauges
249}
250
251// TODO: add label! macro
252
253// #[cfg(test)]
254// mod tests {
255//     use super::*;
256
257//     #[test]
258//     fn it_works() {
259//         let result = 2 + 2;
260//         assert_eq!(result, 4);
261//     }
262// }