Skip to main content

ave_network/
metrics.rs

1//! Prometheus metrics for the network worker.
2
3use std::sync::Arc;
4
5use crate::utils::NetworkState;
6
7use prometheus_client::{
8    encoding::EncodeLabelSet,
9    metrics::{
10        counter::Counter, family::Family, gauge::Gauge, histogram::Histogram,
11    },
12    registry::Registry,
13};
14
15#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
16struct DialAttemptLabels {
17    phase: &'static str,
18}
19
20#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
21struct DialFailureLabels {
22    phase: &'static str,
23    kind: &'static str,
24}
25
26#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
27struct MessageDropLabels {
28    direction: &'static str,
29    reason: &'static str,
30}
31
32#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
33struct ReqResMessageLabels {
34    kind: &'static str,
35}
36
37#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
38struct ReqResFailureLabels {
39    direction: &'static str,
40    kind: &'static str,
41}
42
43#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
44struct ErrorKindLabels {
45    kind: &'static str,
46}
47
48#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
49struct ControlListUpdateLabels {
50    list: &'static str,
51    result: &'static str,
52}
53
54#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
55struct ControlListLabels {
56    list: &'static str,
57}
58
59#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
60struct ControlListDeniedLabels {
61    reason: &'static str,
62}
63
64#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
65struct BootstrapDurationLabels {
66    result: &'static str,
67}
68
69#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
70struct NetworkStateLabels {
71    state: &'static str,
72}
73
74/// Metrics handle used by the network worker.
75#[derive(Debug)]
76pub struct NetworkMetrics {
77    dial_attempts_total: Family<DialAttemptLabels, Counter>,
78    dial_failures_total: Family<DialFailureLabels, Counter>,
79    messages_dropped_total: Family<MessageDropLabels, Counter>,
80    reqres_messages_received_total: Family<ReqResMessageLabels, Counter>,
81    reqres_failures_total: Family<ReqResFailureLabels, Counter>,
82    identify_errors_total: Family<ErrorKindLabels, Counter>,
83    control_list_updates_total: Family<ControlListUpdateLabels, Counter>,
84    control_list_apply_total: Family<ControlListLabels, Counter>,
85    control_list_denied_total: Family<ControlListDeniedLabels, Counter>,
86    retry_queue_len: Gauge,
87    pending_outbound_peers: Gauge,
88    pending_outbound_messages: Gauge,
89    pending_outbound_bytes: Gauge,
90    pending_inbound_peers: Gauge,
91    pending_inbound_messages: Gauge,
92    pending_inbound_bytes: Gauge,
93    identified_peers: Gauge,
94    response_channels_pending: Gauge,
95    control_list_allow_last_success_age_seconds: Gauge,
96    control_list_block_last_success_age_seconds: Gauge,
97    control_list_allow_peers: Gauge,
98    control_list_block_peers: Gauge,
99    state: Family<NetworkStateLabels, Gauge>,
100    bootstrap_duration_seconds:
101        Family<BootstrapDurationLabels, Histogram, fn() -> Histogram>,
102    pending_message_age_seconds: Histogram,
103    control_list_updater_duration_seconds: Histogram,
104}
105
106impl NetworkMetrics {
107    fn new() -> Self {
108        Self {
109            dial_attempts_total: Family::default(),
110            dial_failures_total: Family::default(),
111            messages_dropped_total: Family::default(),
112            reqres_messages_received_total: Family::default(),
113            reqres_failures_total: Family::default(),
114            identify_errors_total: Family::default(),
115            control_list_updates_total: Family::default(),
116            control_list_apply_total: Family::default(),
117            control_list_denied_total: Family::default(),
118            retry_queue_len: Gauge::default(),
119            pending_outbound_peers: Gauge::default(),
120            pending_outbound_messages: Gauge::default(),
121            pending_outbound_bytes: Gauge::default(),
122            pending_inbound_peers: Gauge::default(),
123            pending_inbound_messages: Gauge::default(),
124            pending_inbound_bytes: Gauge::default(),
125            identified_peers: Gauge::default(),
126            response_channels_pending: Gauge::default(),
127            control_list_allow_last_success_age_seconds: Gauge::default(),
128            control_list_block_last_success_age_seconds: Gauge::default(),
129            control_list_allow_peers: Gauge::default(),
130            control_list_block_peers: Gauge::default(),
131            state: Family::default(),
132            bootstrap_duration_seconds: Family::new_with_constructor(|| {
133                Histogram::new(vec![
134                    0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 40.0, 80.0,
135                ])
136            }),
137            pending_message_age_seconds: Histogram::new(vec![
138                0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0,
139            ]),
140            control_list_updater_duration_seconds: Histogram::new(vec![
141                0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0,
142            ]),
143        }
144    }
145
146    fn register_into(&self, registry: &mut Registry) {
147        registry.register(
148            "network_dial_attempts",
149            "Total dial attempts, labeled by phase.",
150            self.dial_attempts_total.clone(),
151        );
152        registry.register(
153            "network_dial_failures",
154            "Total dial failures, labeled by phase and kind.",
155            self.dial_failures_total.clone(),
156        );
157        registry.register(
158            "network_messages_dropped",
159            "Total dropped or rejected messages, labeled by direction and reason.",
160            self.messages_dropped_total.clone(),
161        );
162        registry.register(
163            "network_reqres_messages_received",
164            "Total request-response messages received, labeled by kind.",
165            self.reqres_messages_received_total.clone(),
166        );
167        registry.register(
168            "network_reqres_failures",
169            "Total request-response failures, labeled by direction and kind.",
170            self.reqres_failures_total.clone(),
171        );
172        registry.register(
173            "network_identify_errors",
174            "Total identify protocol errors, labeled by kind.",
175            self.identify_errors_total.clone(),
176        );
177        registry.register(
178            "network_control_list_updates",
179            "Total control-list update attempts, labeled by list and result.",
180            self.control_list_updates_total.clone(),
181        );
182        registry.register(
183            "network_control_list_apply",
184            "Total applied control-list updates, labeled by list.",
185            self.control_list_apply_total.clone(),
186        );
187        registry.register(
188            "network_control_list_denied",
189            "Total denied connections by control list, labeled by reason.",
190            self.control_list_denied_total.clone(),
191        );
192        registry.register(
193            "network_retry_queue_len",
194            "Current retry queue length.",
195            self.retry_queue_len.clone(),
196        );
197        registry.register(
198            "network_pending_outbound_peers",
199            "Peers with pending outbound messages.",
200            self.pending_outbound_peers.clone(),
201        );
202        registry.register(
203            "network_pending_outbound_messages",
204            "Total pending outbound messages.",
205            self.pending_outbound_messages.clone(),
206        );
207        registry.register(
208            "network_pending_outbound_bytes",
209            "Total pending outbound payload bytes.",
210            self.pending_outbound_bytes.clone(),
211        );
212        registry.register(
213            "network_pending_inbound_peers",
214            "Peers with pending inbound messages.",
215            self.pending_inbound_peers.clone(),
216        );
217        registry.register(
218            "network_pending_inbound_messages",
219            "Total pending inbound messages.",
220            self.pending_inbound_messages.clone(),
221        );
222        registry.register(
223            "network_pending_inbound_bytes",
224            "Total pending inbound payload bytes.",
225            self.pending_inbound_bytes.clone(),
226        );
227        registry.register(
228            "network_identified_peers",
229            "Current number of identified peers.",
230            self.identified_peers.clone(),
231        );
232        registry.register(
233            "network_response_channels_pending",
234            "Current number of pending request-response channels.",
235            self.response_channels_pending.clone(),
236        );
237        registry.register(
238            "network_control_list_allow_last_success_age_seconds",
239            "Seconds since last successful allow-list update (-1 if never).",
240            self.control_list_allow_last_success_age_seconds.clone(),
241        );
242        registry.register(
243            "network_control_list_block_last_success_age_seconds",
244            "Seconds since last successful block-list update (-1 if never).",
245            self.control_list_block_last_success_age_seconds.clone(),
246        );
247        registry.register(
248            "network_control_list_allow_peers",
249            "Current number of peers in allow list.",
250            self.control_list_allow_peers.clone(),
251        );
252        registry.register(
253            "network_control_list_block_peers",
254            "Current number of peers in block list.",
255            self.control_list_block_peers.clone(),
256        );
257        registry.register(
258            "network_state",
259            "Current network state as one-hot gauges labeled by state.",
260            self.state.clone(),
261        );
262        registry.register(
263            "network_bootstrap_duration_seconds",
264            "Bootstrap connection duration in seconds, labeled by result.",
265            self.bootstrap_duration_seconds.clone(),
266        );
267        registry.register(
268            "network_pending_message_age_seconds",
269            "Age of pending messages when they leave queue or are dropped.",
270            self.pending_message_age_seconds.clone(),
271        );
272        registry.register(
273            "network_control_list_updater_duration_seconds",
274            "Control-list updater duration in seconds.",
275            self.control_list_updater_duration_seconds.clone(),
276        );
277    }
278
279    pub(crate) fn inc_dial_attempt_bootstrap(&self) {
280        self.dial_attempts_total
281            .get_or_create(&DialAttemptLabels { phase: "bootstrap" })
282            .inc();
283    }
284
285    pub(crate) fn inc_dial_attempt_runtime(&self) {
286        self.dial_attempts_total
287            .get_or_create(&DialAttemptLabels { phase: "runtime" })
288            .inc();
289    }
290
291    pub(crate) fn observe_dial_failure(
292        &self,
293        phase: &'static str,
294        kind: &'static str,
295    ) {
296        self.dial_failures_total
297            .get_or_create(&DialFailureLabels { phase, kind })
298            .inc();
299    }
300
301    pub(crate) fn inc_outbound_queue_drop_by(&self, count: u64) {
302        if count > 0 {
303            self.messages_dropped_total
304                .get_or_create(&MessageDropLabels {
305                    direction: "outbound",
306                    reason: "queue_limit",
307                })
308                .inc_by(count);
309        }
310    }
311
312    pub(crate) fn inc_inbound_queue_drop_by(&self, count: u64) {
313        if count > 0 {
314            self.messages_dropped_total
315                .get_or_create(&MessageDropLabels {
316                    direction: "inbound",
317                    reason: "queue_limit",
318                })
319                .inc_by(count);
320        }
321    }
322
323    pub(crate) fn inc_outbound_queue_bytes_drop_per_peer_by(&self, count: u64) {
324        if count > 0 {
325            self.messages_dropped_total
326                .get_or_create(&MessageDropLabels {
327                    direction: "outbound",
328                    reason: "queue_bytes_limit_per_peer",
329                })
330                .inc_by(count);
331        }
332    }
333
334    pub(crate) fn inc_outbound_queue_bytes_drop_global_by(&self, count: u64) {
335        if count > 0 {
336            self.messages_dropped_total
337                .get_or_create(&MessageDropLabels {
338                    direction: "outbound",
339                    reason: "queue_bytes_limit_global",
340                })
341                .inc_by(count);
342        }
343    }
344
345    pub(crate) fn inc_inbound_queue_bytes_drop_per_peer_by(&self, count: u64) {
346        if count > 0 {
347            self.messages_dropped_total
348                .get_or_create(&MessageDropLabels {
349                    direction: "inbound",
350                    reason: "queue_bytes_limit_per_peer",
351                })
352                .inc_by(count);
353        }
354    }
355
356    pub(crate) fn inc_inbound_queue_bytes_drop_global_by(&self, count: u64) {
357        if count > 0 {
358            self.messages_dropped_total
359                .get_or_create(&MessageDropLabels {
360                    direction: "inbound",
361                    reason: "queue_bytes_limit_global",
362                })
363                .inc_by(count);
364        }
365    }
366
367    pub(crate) fn inc_max_retries_drop_by(&self, count: u64) {
368        if count > 0 {
369            self.messages_dropped_total
370                .get_or_create(&MessageDropLabels {
371                    direction: "outbound",
372                    reason: "max_retries",
373                })
374                .inc_by(count);
375        }
376    }
377
378    pub(crate) fn inc_oversized_inbound_drop(&self) {
379        self.messages_dropped_total
380            .get_or_create(&MessageDropLabels {
381                direction: "inbound",
382                reason: "oversized",
383            })
384            .inc();
385    }
386
387    pub(crate) fn inc_oversized_outbound_drop(&self) {
388        self.messages_dropped_total
389            .get_or_create(&MessageDropLabels {
390                direction: "outbound",
391                reason: "oversized",
392            })
393            .inc();
394    }
395
396    pub(crate) fn inc_reqres_request_received(&self) {
397        self.reqres_messages_received_total
398            .get_or_create(&ReqResMessageLabels { kind: "request" })
399            .inc();
400    }
401
402    pub(crate) fn inc_reqres_response_received(&self) {
403        self.reqres_messages_received_total
404            .get_or_create(&ReqResMessageLabels { kind: "response" })
405            .inc();
406    }
407
408    pub(crate) fn observe_reqres_failure(
409        &self,
410        direction: &'static str,
411        kind: &'static str,
412    ) {
413        self.reqres_failures_total
414            .get_or_create(&ReqResFailureLabels { direction, kind })
415            .inc();
416    }
417
418    pub(crate) fn observe_identify_error(&self, kind: &'static str) {
419        self.identify_errors_total
420            .get_or_create(&ErrorKindLabels { kind })
421            .inc();
422    }
423
424    pub(crate) fn observe_control_list_denied(&self, reason: &'static str) {
425        self.control_list_denied_total
426            .get_or_create(&ControlListDeniedLabels { reason })
427            .inc();
428    }
429
430    pub(crate) fn observe_control_list_allow_update(&self, success: bool) {
431        let result = if success { "success" } else { "failure" };
432        self.control_list_updates_total
433            .get_or_create(&ControlListUpdateLabels {
434                list: "allow",
435                result,
436            })
437            .inc();
438    }
439
440    pub(crate) fn observe_control_list_block_update(&self, success: bool) {
441        let result = if success { "success" } else { "failure" };
442        self.control_list_updates_total
443            .get_or_create(&ControlListUpdateLabels {
444                list: "block",
445                result,
446            })
447            .inc();
448    }
449
450    pub(crate) fn inc_control_list_allow_apply(&self) {
451        self.control_list_apply_total
452            .get_or_create(&ControlListLabels { list: "allow" })
453            .inc();
454    }
455
456    pub(crate) fn inc_control_list_block_apply(&self) {
457        self.control_list_apply_total
458            .get_or_create(&ControlListLabels { list: "block" })
459            .inc();
460    }
461
462    pub(crate) fn set_control_list_allow_last_success_age_seconds(
463        &self,
464        value: i64,
465    ) {
466        self.control_list_allow_last_success_age_seconds.set(value);
467    }
468
469    pub(crate) fn set_control_list_block_last_success_age_seconds(
470        &self,
471        value: i64,
472    ) {
473        self.control_list_block_last_success_age_seconds.set(value);
474    }
475
476    pub(crate) fn set_control_list_allow_peers(&self, value: i64) {
477        self.control_list_allow_peers.set(value);
478    }
479
480    pub(crate) fn set_control_list_block_peers(&self, value: i64) {
481        self.control_list_block_peers.set(value);
482    }
483
484    pub(crate) fn observe_control_list_updater_duration_seconds(
485        &self,
486        seconds: f64,
487    ) {
488        self.control_list_updater_duration_seconds.observe(seconds);
489    }
490
491    pub(crate) fn set_state_current(&self, state: &NetworkState) {
492        let current = Self::state_label(state);
493        for known in Self::state_labels() {
494            self.state
495                .get_or_create(&NetworkStateLabels { state: known })
496                .set((known == current) as i64);
497        }
498    }
499
500    pub(crate) fn observe_state_transition(&self, state: &NetworkState) {
501        self.set_state_current(state);
502    }
503
504    pub(crate) fn observe_pending_message_age_seconds(&self, age_seconds: f64) {
505        self.pending_message_age_seconds.observe(age_seconds);
506    }
507
508    pub(crate) fn set_retry_queue_len(&self, value: i64) {
509        self.retry_queue_len.set(value);
510    }
511
512    pub(crate) fn set_pending_outbound_peers(&self, value: i64) {
513        self.pending_outbound_peers.set(value);
514    }
515
516    pub(crate) fn set_pending_outbound_messages(&self, value: i64) {
517        self.pending_outbound_messages.set(value);
518    }
519
520    pub(crate) fn set_pending_outbound_bytes(&self, value: i64) {
521        self.pending_outbound_bytes.set(value);
522    }
523
524    pub(crate) fn set_pending_inbound_peers(&self, value: i64) {
525        self.pending_inbound_peers.set(value);
526    }
527
528    pub(crate) fn set_pending_inbound_messages(&self, value: i64) {
529        self.pending_inbound_messages.set(value);
530    }
531
532    pub(crate) fn set_pending_inbound_bytes(&self, value: i64) {
533        self.pending_inbound_bytes.set(value);
534    }
535
536    pub(crate) fn set_identified_peers(&self, value: i64) {
537        self.identified_peers.set(value);
538    }
539
540    pub(crate) fn set_response_channels_pending(&self, value: i64) {
541        self.response_channels_pending.set(value);
542    }
543
544    pub(crate) fn observe_bootstrap_duration_seconds(
545        &self,
546        result: &'static str,
547        seconds: f64,
548    ) {
549        self.bootstrap_duration_seconds
550            .get_or_create(&BootstrapDurationLabels { result })
551            .observe(seconds);
552    }
553
554    const fn state_labels() -> [&'static str; 5] {
555        ["start", "dial", "dialing", "running", "disconnected"]
556    }
557
558    const fn state_label(state: &NetworkState) -> &'static str {
559        match state {
560            NetworkState::Start => "start",
561            NetworkState::Dial => "dial",
562            NetworkState::Dialing => "dialing",
563            NetworkState::Running => "running",
564            NetworkState::Disconnected => "disconnected",
565        }
566    }
567}
568
569/// Register network metrics in the provided Prometheus registry.
570///
571/// Returns a shared handle that can be passed to the network worker.
572pub fn register(registry: &mut Registry) -> Arc<NetworkMetrics> {
573    let metrics = Arc::new(NetworkMetrics::new());
574    metrics.register_into(registry);
575    metrics
576}