1use prometheus::{opts, CounterVec, GaugeVec, HistogramVec, Opts, Registry};
2use std::sync::OnceLock;
3
4pub static SWITCHBOARD_METRICS: OnceLock<SwitchboardMetrics> = OnceLock::new();
6
7#[derive(Debug, Clone)]
8pub struct SwitchboardMetrics {
9 pub registry: Registry,
10 pub fn_backoff_counter: CounterVec,
11 pub fn_execution_stolen_counter: CounterVec,
12 pub request_counter: CounterVec,
13 pub boot_counter: CounterVec,
14 pub network_call_gauge: GaugeVec,
15 pub runtime_gauge: GaugeVec,
16 pub fn_error_code_gauge: GaugeVec,
17 pub unhandled_error_counter: CounterVec,
18 pub fn_timeout_counter: CounterVec,
19 pub oracle_available_permits_gauge: GaugeVec,
20 pub qvn_error_report_failed_counter: CounterVec,
21 pub oracle_img_dl_counter: CounterVec,
22 pub oracle_dl_routine_latency: GaugeVec,
23 pub oracle_awaiter_routine_latency: GaugeVec,
24 pub oracle_poller_latency: GaugeVec,
25 pub function_execution_histogram: HistogramVec,
26}
27
28impl SwitchboardMetrics {
29 pub fn get_or_init() -> &'static Self {
30 SWITCHBOARD_METRICS.get_or_init(SwitchboardMetrics::initialize)
31 }
32
33 pub fn initialize() -> Self {
34 let registry = Registry::new();
35
36 let fn_backoff_counter = CounterVec::new(
37 Opts::new(
38 "switchboard_function_backoff_counter",
39 "Function backoff counter",
40 ),
41 &["chain", "chain_id", "queue_key", "oracle_key"],
42 )
43 .unwrap();
44 prometheus::register(Box::new(fn_backoff_counter.clone())).unwrap();
45
46 let fn_execution_stolen_counter = CounterVec::new(
47 Opts::new(
48 "switchboard_function_stolen_execution_counter",
49 "Function execution stolen counter",
50 ),
51 &[
52 "chain",
53 "chain_id",
54 "queue_key",
55 "oracle_key",
56 "victim_oracle_key",
57 "function_key",
58 ],
59 )
60 .unwrap();
61 prometheus::register(Box::new(fn_execution_stolen_counter.clone())).unwrap();
62
63 let request_counter = CounterVec::new(
64 Opts::new(
65 "switchboard_function_request_counter",
66 "Function TCP request counter",
67 ),
68 &["chain", "chain_id", "queue_key", "oracle_key", "img_name"],
69 )
70 .unwrap();
71 prometheus::register(Box::new(request_counter.clone())).unwrap();
72
73 let boot_counter = CounterVec::new(
74 Opts::new(
75 "switchboard_function_manager_boot_counter",
76 "DIND Boot counter",
77 ),
78 &["chain", "chain_id", "queue_key", "oracle_key"],
79 )
80 .unwrap();
81 prometheus::register(Box::new(boot_counter.clone())).unwrap();
82
83 let network_call_gauge = GaugeVec::new(
84 Opts::new(
85 "switchboard_function_network_call_gauge",
86 "Network Call Counter",
87 ),
88 &["chain", "chain_id", "queue_key", "oracle_key", "function"],
89 )
90 .unwrap();
91 prometheus::register(Box::new(network_call_gauge.clone())).unwrap();
92
93 let runtime_gauge = GaugeVec::new(
94 Opts::new(
95 "switchboard_function_runtime_gauge",
96 "Function Runtime Gauge",
97 ),
98 &[
99 "chain",
100 "chain_id",
101 "queue_key",
102 "oracle_key",
103 "function_key",
104 "function_request_key",
105 ],
106 )
107 .unwrap();
108 prometheus::register(Box::new(runtime_gauge.clone())).unwrap();
109
110 let fn_error_code_gauge = GaugeVec::new(
111 Opts::new(
112 "switchboard_function_error_code_gauge",
113 "Function Runtime Gauge",
114 ),
115 &[
116 "chain",
117 "chain_id",
118 "queue_key",
119 "oracle_key",
120 "function_key",
121 "function_request_key",
122 "code",
123 ],
124 )
125 .unwrap();
126 prometheus::register(Box::new(fn_error_code_gauge.clone())).unwrap();
127
128 let unhandled_error_counter = CounterVec::new(
129 Opts::new(
130 "switchboard_function_unhandled_error_counter",
131 "Function unhandled error counter",
132 ),
133 &[
134 "chain",
135 "chain_id",
136 "queue_key",
137 "oracle_key",
138 "function_key",
139 "function_request_key",
140 ],
141 )
142 .unwrap();
143 prometheus::register(Box::new(unhandled_error_counter.clone())).unwrap();
144
145 let fn_timeout_counter = CounterVec::new(
146 Opts::new(
147 "switchboard_function_timeout_counter",
148 "Function run timeout counter",
149 ),
150 &[
151 "chain",
152 "chain_id",
153 "queue_key",
154 "oracle_key",
155 "function_key",
156 ],
157 )
158 .unwrap();
159 prometheus::register(Box::new(fn_timeout_counter.clone())).unwrap();
160
161 let oracle_available_permits_gauge = GaugeVec::new(
162 Opts::new("switchboard_oracle_available_permits_gauge", "ph"),
163 &["chain", "chain_id", "queue_key", "oracle_key"],
164 )
165 .unwrap();
166 prometheus::register(Box::new(oracle_available_permits_gauge.clone())).unwrap();
167
168 let qvn_error_report_failed_counter = CounterVec::new(
169 Opts::new("switchboard_qvn_error_report_failed_counter", "ph"),
170 &[
171 "chain",
172 "chain_id",
173 "queue_key",
174 "oracle_key",
175 "function_key",
176 "function_request_key",
177 ],
178 )
179 .unwrap();
180 prometheus::register(Box::new(qvn_error_report_failed_counter.clone())).unwrap();
181
182 let oracle_img_dl_counter = CounterVec::new(
183 Opts::new("switchboard_img_dl_counter", "ph"),
184 &["chain", "chain_id", "queue_key", "oracle_key", "container"],
185 )
186 .unwrap();
187 prometheus::register(Box::new(oracle_img_dl_counter.clone())).unwrap();
188
189 let oracle_dl_routine_latency = GaugeVec::new(
190 Opts::new("switchboard_oracle_dl_routine_latency", "ph"),
191 &["chain", "chain_id", "queue_key", "oracle_key"],
192 )
193 .unwrap();
194 prometheus::register(Box::new(oracle_dl_routine_latency.clone())).unwrap();
195
196 let oracle_awaiter_routine_latency = GaugeVec::new(
197 Opts::new("switchboard_oracle_awaiter_routine_latency", "ph"),
198 &["chain", "chain_id", "queue_key", "oracle_key"],
199 )
200 .unwrap();
201 prometheus::register(Box::new(oracle_awaiter_routine_latency.clone())).unwrap();
202
203 let oracle_poller_latency = GaugeVec::new(
204 Opts::new("switchboard_oracle_poller_routine_latency", "ph"),
205 &["chain", "chain_id", "queue_key", "oracle_key"],
206 )
207 .unwrap();
208 prometheus::register(Box::new(oracle_poller_latency.clone())).unwrap();
209
210 let function_execution_histogram = HistogramVec::new(
211 opts!(
212 "function_execution_duration",
213 "Function execution duration in seconds"
214 )
215 .into(),
216 &["task_id"],
217 )
218 .expect("Failed to create function_execution_duration histogram");
219 prometheus::register(Box::new(function_execution_histogram.clone())).unwrap();
220
221 SwitchboardMetrics {
222 registry,
223 fn_backoff_counter,
224 fn_execution_stolen_counter,
225 request_counter,
226 boot_counter,
227 network_call_gauge,
228 runtime_gauge,
229 fn_error_code_gauge,
230 unhandled_error_counter,
231 fn_timeout_counter,
232 oracle_available_permits_gauge,
233 qvn_error_report_failed_counter,
234 oracle_img_dl_counter,
235 oracle_dl_routine_latency,
236 oracle_awaiter_routine_latency,
237 oracle_poller_latency,
238 function_execution_histogram,
239 }
240 }
241
242 pub fn record_fn_execution_duration(&self, task_id: &str, duration_secs: f64) {
243 self.function_execution_histogram
244 .with_label_values(&[task_id])
245 .observe(duration_secs);
246 }
247
248 }
250
251