1use std::sync::Arc;
4
5use crate::utils::NetworkState;
6
7use prometheus_client::{
8 encoding::EncodeLabelSet,
9 metrics::{
10 counter::Counter, family::Family, gauge::Gauge, histogram::Histogram,
11 },
12 registry::Registry,
13};
14
15#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
16struct DialAttemptLabels {
17 phase: &'static str,
18}
19
20#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
21struct DialFailureLabels {
22 phase: &'static str,
23 kind: &'static str,
24}
25
26#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
27struct MessageDropLabels {
28 direction: &'static str,
29 reason: &'static str,
30}
31
32#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
33struct ReqResMessageLabels {
34 kind: &'static str,
35}
36
37#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
38struct ReqResFailureLabels {
39 direction: &'static str,
40 kind: &'static str,
41}
42
43#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
44struct ErrorKindLabels {
45 kind: &'static str,
46}
47
48#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
49struct ControlListUpdateLabels {
50 list: &'static str,
51 result: &'static str,
52}
53
54#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
55struct ControlListLabels {
56 list: &'static str,
57}
58
59#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
60struct ControlListDeniedLabels {
61 reason: &'static str,
62}
63
64#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
65struct BootstrapDurationLabels {
66 result: &'static str,
67}
68
69#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
70struct NetworkStateLabels {
71 state: &'static str,
72}
73
74#[derive(Debug)]
76pub struct NetworkMetrics {
77 dial_attempts_total: Family<DialAttemptLabels, Counter>,
78 dial_failures_total: Family<DialFailureLabels, Counter>,
79 messages_dropped_total: Family<MessageDropLabels, Counter>,
80 reqres_messages_received_total: Family<ReqResMessageLabels, Counter>,
81 reqres_failures_total: Family<ReqResFailureLabels, Counter>,
82 identify_errors_total: Family<ErrorKindLabels, Counter>,
83 control_list_updates_total: Family<ControlListUpdateLabels, Counter>,
84 control_list_apply_total: Family<ControlListLabels, Counter>,
85 control_list_denied_total: Family<ControlListDeniedLabels, Counter>,
86 retry_queue_len: Gauge,
87 pending_outbound_peers: Gauge,
88 pending_outbound_messages: Gauge,
89 pending_outbound_bytes: Gauge,
90 pending_inbound_peers: Gauge,
91 pending_inbound_messages: Gauge,
92 pending_inbound_bytes: Gauge,
93 identified_peers: Gauge,
94 response_channels_pending: Gauge,
95 control_list_allow_last_success_age_seconds: Gauge,
96 control_list_block_last_success_age_seconds: Gauge,
97 control_list_allow_peers: Gauge,
98 control_list_block_peers: Gauge,
99 state: Family<NetworkStateLabels, Gauge>,
100 bootstrap_duration_seconds:
101 Family<BootstrapDurationLabels, Histogram, fn() -> Histogram>,
102 pending_message_age_seconds: Histogram,
103 control_list_updater_duration_seconds: Histogram,
104}
105
106impl NetworkMetrics {
107 fn new() -> Self {
108 Self {
109 dial_attempts_total: Family::default(),
110 dial_failures_total: Family::default(),
111 messages_dropped_total: Family::default(),
112 reqres_messages_received_total: Family::default(),
113 reqres_failures_total: Family::default(),
114 identify_errors_total: Family::default(),
115 control_list_updates_total: Family::default(),
116 control_list_apply_total: Family::default(),
117 control_list_denied_total: Family::default(),
118 retry_queue_len: Gauge::default(),
119 pending_outbound_peers: Gauge::default(),
120 pending_outbound_messages: Gauge::default(),
121 pending_outbound_bytes: Gauge::default(),
122 pending_inbound_peers: Gauge::default(),
123 pending_inbound_messages: Gauge::default(),
124 pending_inbound_bytes: Gauge::default(),
125 identified_peers: Gauge::default(),
126 response_channels_pending: Gauge::default(),
127 control_list_allow_last_success_age_seconds: Gauge::default(),
128 control_list_block_last_success_age_seconds: Gauge::default(),
129 control_list_allow_peers: Gauge::default(),
130 control_list_block_peers: Gauge::default(),
131 state: Family::default(),
132 bootstrap_duration_seconds: Family::new_with_constructor(|| {
133 Histogram::new(vec![
134 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 40.0, 80.0,
135 ])
136 }),
137 pending_message_age_seconds: Histogram::new(vec![
138 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0,
139 ]),
140 control_list_updater_duration_seconds: Histogram::new(vec![
141 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0,
142 ]),
143 }
144 }
145
146 fn register_into(&self, registry: &mut Registry) {
147 registry.register(
148 "network_dial_attempts",
149 "Total dial attempts, labeled by phase.",
150 self.dial_attempts_total.clone(),
151 );
152 registry.register(
153 "network_dial_failures",
154 "Total dial failures, labeled by phase and kind.",
155 self.dial_failures_total.clone(),
156 );
157 registry.register(
158 "network_messages_dropped",
159 "Total dropped or rejected messages, labeled by direction and reason.",
160 self.messages_dropped_total.clone(),
161 );
162 registry.register(
163 "network_reqres_messages_received",
164 "Total request-response messages received, labeled by kind.",
165 self.reqres_messages_received_total.clone(),
166 );
167 registry.register(
168 "network_reqres_failures",
169 "Total request-response failures, labeled by direction and kind.",
170 self.reqres_failures_total.clone(),
171 );
172 registry.register(
173 "network_identify_errors",
174 "Total identify protocol errors, labeled by kind.",
175 self.identify_errors_total.clone(),
176 );
177 registry.register(
178 "network_control_list_updates",
179 "Total control-list update attempts, labeled by list and result.",
180 self.control_list_updates_total.clone(),
181 );
182 registry.register(
183 "network_control_list_apply",
184 "Total applied control-list updates, labeled by list.",
185 self.control_list_apply_total.clone(),
186 );
187 registry.register(
188 "network_control_list_denied",
189 "Total denied connections by control list, labeled by reason.",
190 self.control_list_denied_total.clone(),
191 );
192 registry.register(
193 "network_retry_queue_len",
194 "Current retry queue length.",
195 self.retry_queue_len.clone(),
196 );
197 registry.register(
198 "network_pending_outbound_peers",
199 "Peers with pending outbound messages.",
200 self.pending_outbound_peers.clone(),
201 );
202 registry.register(
203 "network_pending_outbound_messages",
204 "Total pending outbound messages.",
205 self.pending_outbound_messages.clone(),
206 );
207 registry.register(
208 "network_pending_outbound_bytes",
209 "Total pending outbound payload bytes.",
210 self.pending_outbound_bytes.clone(),
211 );
212 registry.register(
213 "network_pending_inbound_peers",
214 "Peers with pending inbound messages.",
215 self.pending_inbound_peers.clone(),
216 );
217 registry.register(
218 "network_pending_inbound_messages",
219 "Total pending inbound messages.",
220 self.pending_inbound_messages.clone(),
221 );
222 registry.register(
223 "network_pending_inbound_bytes",
224 "Total pending inbound payload bytes.",
225 self.pending_inbound_bytes.clone(),
226 );
227 registry.register(
228 "network_identified_peers",
229 "Current number of identified peers.",
230 self.identified_peers.clone(),
231 );
232 registry.register(
233 "network_response_channels_pending",
234 "Current number of pending request-response channels.",
235 self.response_channels_pending.clone(),
236 );
237 registry.register(
238 "network_control_list_allow_last_success_age_seconds",
239 "Seconds since last successful allow-list update (-1 if never).",
240 self.control_list_allow_last_success_age_seconds.clone(),
241 );
242 registry.register(
243 "network_control_list_block_last_success_age_seconds",
244 "Seconds since last successful block-list update (-1 if never).",
245 self.control_list_block_last_success_age_seconds.clone(),
246 );
247 registry.register(
248 "network_control_list_allow_peers",
249 "Current number of peers in allow list.",
250 self.control_list_allow_peers.clone(),
251 );
252 registry.register(
253 "network_control_list_block_peers",
254 "Current number of peers in block list.",
255 self.control_list_block_peers.clone(),
256 );
257 registry.register(
258 "network_state",
259 "Current network state as one-hot gauges labeled by state.",
260 self.state.clone(),
261 );
262 registry.register(
263 "network_bootstrap_duration_seconds",
264 "Bootstrap connection duration in seconds, labeled by result.",
265 self.bootstrap_duration_seconds.clone(),
266 );
267 registry.register(
268 "network_pending_message_age_seconds",
269 "Age of pending messages when they leave queue or are dropped.",
270 self.pending_message_age_seconds.clone(),
271 );
272 registry.register(
273 "network_control_list_updater_duration_seconds",
274 "Control-list updater duration in seconds.",
275 self.control_list_updater_duration_seconds.clone(),
276 );
277 }
278
279 pub(crate) fn inc_dial_attempt_bootstrap(&self) {
280 self.dial_attempts_total
281 .get_or_create(&DialAttemptLabels { phase: "bootstrap" })
282 .inc();
283 }
284
285 pub(crate) fn inc_dial_attempt_runtime(&self) {
286 self.dial_attempts_total
287 .get_or_create(&DialAttemptLabels { phase: "runtime" })
288 .inc();
289 }
290
291 pub(crate) fn observe_dial_failure(
292 &self,
293 phase: &'static str,
294 kind: &'static str,
295 ) {
296 self.dial_failures_total
297 .get_or_create(&DialFailureLabels { phase, kind })
298 .inc();
299 }
300
301 pub(crate) fn inc_outbound_queue_drop_by(&self, count: u64) {
302 if count > 0 {
303 self.messages_dropped_total
304 .get_or_create(&MessageDropLabels {
305 direction: "outbound",
306 reason: "queue_limit",
307 })
308 .inc_by(count);
309 }
310 }
311
312 pub(crate) fn inc_inbound_queue_drop_by(&self, count: u64) {
313 if count > 0 {
314 self.messages_dropped_total
315 .get_or_create(&MessageDropLabels {
316 direction: "inbound",
317 reason: "queue_limit",
318 })
319 .inc_by(count);
320 }
321 }
322
323 pub(crate) fn inc_outbound_queue_bytes_drop_per_peer_by(&self, count: u64) {
324 if count > 0 {
325 self.messages_dropped_total
326 .get_or_create(&MessageDropLabels {
327 direction: "outbound",
328 reason: "queue_bytes_limit_per_peer",
329 })
330 .inc_by(count);
331 }
332 }
333
334 pub(crate) fn inc_outbound_queue_bytes_drop_global_by(&self, count: u64) {
335 if count > 0 {
336 self.messages_dropped_total
337 .get_or_create(&MessageDropLabels {
338 direction: "outbound",
339 reason: "queue_bytes_limit_global",
340 })
341 .inc_by(count);
342 }
343 }
344
345 pub(crate) fn inc_inbound_queue_bytes_drop_per_peer_by(&self, count: u64) {
346 if count > 0 {
347 self.messages_dropped_total
348 .get_or_create(&MessageDropLabels {
349 direction: "inbound",
350 reason: "queue_bytes_limit_per_peer",
351 })
352 .inc_by(count);
353 }
354 }
355
356 pub(crate) fn inc_inbound_queue_bytes_drop_global_by(&self, count: u64) {
357 if count > 0 {
358 self.messages_dropped_total
359 .get_or_create(&MessageDropLabels {
360 direction: "inbound",
361 reason: "queue_bytes_limit_global",
362 })
363 .inc_by(count);
364 }
365 }
366
367 pub(crate) fn inc_max_retries_drop_by(&self, count: u64) {
368 if count > 0 {
369 self.messages_dropped_total
370 .get_or_create(&MessageDropLabels {
371 direction: "outbound",
372 reason: "max_retries",
373 })
374 .inc_by(count);
375 }
376 }
377
378 pub(crate) fn inc_oversized_inbound_drop(&self) {
379 self.messages_dropped_total
380 .get_or_create(&MessageDropLabels {
381 direction: "inbound",
382 reason: "oversized",
383 })
384 .inc();
385 }
386
387 pub(crate) fn inc_oversized_outbound_drop(&self) {
388 self.messages_dropped_total
389 .get_or_create(&MessageDropLabels {
390 direction: "outbound",
391 reason: "oversized",
392 })
393 .inc();
394 }
395
396 pub(crate) fn inc_reqres_request_received(&self) {
397 self.reqres_messages_received_total
398 .get_or_create(&ReqResMessageLabels { kind: "request" })
399 .inc();
400 }
401
402 pub(crate) fn inc_reqres_response_received(&self) {
403 self.reqres_messages_received_total
404 .get_or_create(&ReqResMessageLabels { kind: "response" })
405 .inc();
406 }
407
408 pub(crate) fn observe_reqres_failure(
409 &self,
410 direction: &'static str,
411 kind: &'static str,
412 ) {
413 self.reqres_failures_total
414 .get_or_create(&ReqResFailureLabels { direction, kind })
415 .inc();
416 }
417
418 pub(crate) fn observe_identify_error(&self, kind: &'static str) {
419 self.identify_errors_total
420 .get_or_create(&ErrorKindLabels { kind })
421 .inc();
422 }
423
424 pub(crate) fn observe_control_list_denied(&self, reason: &'static str) {
425 self.control_list_denied_total
426 .get_or_create(&ControlListDeniedLabels { reason })
427 .inc();
428 }
429
430 pub(crate) fn observe_control_list_allow_update(&self, success: bool) {
431 let result = if success { "success" } else { "failure" };
432 self.control_list_updates_total
433 .get_or_create(&ControlListUpdateLabels {
434 list: "allow",
435 result,
436 })
437 .inc();
438 }
439
440 pub(crate) fn observe_control_list_block_update(&self, success: bool) {
441 let result = if success { "success" } else { "failure" };
442 self.control_list_updates_total
443 .get_or_create(&ControlListUpdateLabels {
444 list: "block",
445 result,
446 })
447 .inc();
448 }
449
450 pub(crate) fn inc_control_list_allow_apply(&self) {
451 self.control_list_apply_total
452 .get_or_create(&ControlListLabels { list: "allow" })
453 .inc();
454 }
455
456 pub(crate) fn inc_control_list_block_apply(&self) {
457 self.control_list_apply_total
458 .get_or_create(&ControlListLabels { list: "block" })
459 .inc();
460 }
461
462 pub(crate) fn set_control_list_allow_last_success_age_seconds(
463 &self,
464 value: i64,
465 ) {
466 self.control_list_allow_last_success_age_seconds.set(value);
467 }
468
469 pub(crate) fn set_control_list_block_last_success_age_seconds(
470 &self,
471 value: i64,
472 ) {
473 self.control_list_block_last_success_age_seconds.set(value);
474 }
475
476 pub(crate) fn set_control_list_allow_peers(&self, value: i64) {
477 self.control_list_allow_peers.set(value);
478 }
479
480 pub(crate) fn set_control_list_block_peers(&self, value: i64) {
481 self.control_list_block_peers.set(value);
482 }
483
484 pub(crate) fn observe_control_list_updater_duration_seconds(
485 &self,
486 seconds: f64,
487 ) {
488 self.control_list_updater_duration_seconds.observe(seconds);
489 }
490
491 pub(crate) fn set_state_current(&self, state: &NetworkState) {
492 let current = Self::state_label(state);
493 for known in Self::state_labels() {
494 self.state
495 .get_or_create(&NetworkStateLabels { state: known })
496 .set((known == current) as i64);
497 }
498 }
499
500 pub(crate) fn observe_state_transition(&self, state: &NetworkState) {
501 self.set_state_current(state);
502 }
503
504 pub(crate) fn observe_pending_message_age_seconds(&self, age_seconds: f64) {
505 self.pending_message_age_seconds.observe(age_seconds);
506 }
507
508 pub(crate) fn set_retry_queue_len(&self, value: i64) {
509 self.retry_queue_len.set(value);
510 }
511
512 pub(crate) fn set_pending_outbound_peers(&self, value: i64) {
513 self.pending_outbound_peers.set(value);
514 }
515
516 pub(crate) fn set_pending_outbound_messages(&self, value: i64) {
517 self.pending_outbound_messages.set(value);
518 }
519
520 pub(crate) fn set_pending_outbound_bytes(&self, value: i64) {
521 self.pending_outbound_bytes.set(value);
522 }
523
524 pub(crate) fn set_pending_inbound_peers(&self, value: i64) {
525 self.pending_inbound_peers.set(value);
526 }
527
528 pub(crate) fn set_pending_inbound_messages(&self, value: i64) {
529 self.pending_inbound_messages.set(value);
530 }
531
532 pub(crate) fn set_pending_inbound_bytes(&self, value: i64) {
533 self.pending_inbound_bytes.set(value);
534 }
535
536 pub(crate) fn set_identified_peers(&self, value: i64) {
537 self.identified_peers.set(value);
538 }
539
540 pub(crate) fn set_response_channels_pending(&self, value: i64) {
541 self.response_channels_pending.set(value);
542 }
543
544 pub(crate) fn observe_bootstrap_duration_seconds(
545 &self,
546 result: &'static str,
547 seconds: f64,
548 ) {
549 self.bootstrap_duration_seconds
550 .get_or_create(&BootstrapDurationLabels { result })
551 .observe(seconds);
552 }
553
554 const fn state_labels() -> [&'static str; 5] {
555 ["start", "dial", "dialing", "running", "disconnected"]
556 }
557
558 const fn state_label(state: &NetworkState) -> &'static str {
559 match state {
560 NetworkState::Start => "start",
561 NetworkState::Dial => "dial",
562 NetworkState::Dialing => "dialing",
563 NetworkState::Running => "running",
564 NetworkState::Disconnected => "disconnected",
565 }
566 }
567}
568
569pub fn register(registry: &mut Registry) -> Arc<NetworkMetrics> {
573 let metrics = Arc::new(NetworkMetrics::new());
574 metrics.register_into(registry);
575 metrics
576}