Skip to main content

solti_prometheus/
discover.rs

1//! Discovery heartbeat metrics: Prometheus implementation of [`solti_discover::DiscoverMetricsBackend`].
2
3use std::sync::Arc;
4use std::time::{SystemTime, UNIX_EPOCH};
5
6use prometheus::{Counter, CounterVec, Gauge, Histogram, HistogramVec, Registry};
7use solti_discover::{DiscoverMetricsBackend, OUTCOME_FAILURE, OUTCOME_SUCCESS};
8
9use crate::register::{Sub, ms_to_secs};
10
11/// Prometheus implementation of [`DiscoverMetricsBackend`].
12///
13/// ## Metrics (`solti_discover_*`)
14///
15/// | Metric                                          | Type      | Labels    | Description                         |
16/// |-------------------------------------------------|-----------|-----------|-------------------------------------|
17/// | `solti_discover_attempts_total`                 | Counter   | -         | Total sync attempts                 |
18/// | `solti_discover_outcomes_total`                 | Counter   | `outcome` | Outcomes (`success` / `failure`)    |
19/// | `solti_discover_duration_seconds`               | Histogram | `outcome` | Sync call duration                  |
20/// | `solti_discover_failures_total`                 | Counter   | `reason`  | Failures grouped by reason          |
21/// | `solti_discover_last_success_timestamp_seconds` | Gauge     | -         | UNIX time of last successful sync   |
22/// | `solti_discover_holds_total`                    | Counter   | -         | Server-advised retry holds received |
23/// | `solti_discover_hold_duration_seconds`          | Histogram | -         | Duration of advised holds           |
24pub struct PrometheusDiscoverMetrics {
25    attempts_total: Counter,
26    outcomes_total: CounterVec,
27    duration_seconds: HistogramVec,
28    failures_total: CounterVec,
29    last_success_ts: Gauge,
30    holds_total: Counter,
31    hold_duration_seconds: Histogram,
32}
33
34impl PrometheusDiscoverMetrics {
35    /// Register all discovery metrics into `registry`.
36    pub fn new(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
37        let r = Sub::new(&registry, "discover");
38
39        let attempts_total = r.counter("attempts_total", "Total discovery heartbeat attempts")?;
40        let outcomes_total = r.counter_vec(
41            "outcomes_total",
42            "Discovery heartbeat outcomes",
43            &["outcome"],
44        )?;
45        let duration_seconds = r.histogram_vec(
46            "duration_seconds",
47            "Discovery heartbeat call duration",
48            vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
49            &["outcome"],
50        )?;
51        let failures_total = r.counter_vec(
52            "failures_total",
53            "Discovery heartbeat failures grouped by reason",
54            &["reason"],
55        )?;
56        let last_success_ts = r.gauge(
57            "last_success_timestamp_seconds",
58            "UNIX timestamp of the last successful heartbeat",
59        )?;
60        let holds_total = r.counter("holds_total", "Server-advised retry holds observed")?;
61        let hold_duration_seconds = r.histogram(
62            "hold_duration_seconds",
63            "Duration of server-advised retry holds",
64            vec![1.0, 5.0, 15.0, 30.0, 60.0, 300.0, 900.0, 1800.0, 3600.0],
65        )?;
66
67        Ok(Self {
68            attempts_total,
69            outcomes_total,
70            duration_seconds,
71            failures_total,
72            last_success_ts,
73            holds_total,
74            hold_duration_seconds,
75        })
76    }
77}
78
79impl std::fmt::Debug for PrometheusDiscoverMetrics {
80    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
81        f.debug_struct("PrometheusDiscoverMetrics").finish()
82    }
83}
84
85impl DiscoverMetricsBackend for PrometheusDiscoverMetrics {
86    fn record_attempt(&self) {
87        self.attempts_total.inc();
88    }
89
90    fn record_success(&self, duration_ms: u64) {
91        self.outcomes_total
92            .with_label_values(&[OUTCOME_SUCCESS])
93            .inc();
94        self.duration_seconds
95            .with_label_values(&[OUTCOME_SUCCESS])
96            .observe(ms_to_secs(duration_ms));
97        let ts = SystemTime::now()
98            .duration_since(UNIX_EPOCH)
99            .map(|d| d.as_secs_f64())
100            .unwrap_or(0.0);
101        self.last_success_ts.set(ts);
102    }
103
104    fn record_failure(&self, duration_ms: u64, reason: &'static str) {
105        self.outcomes_total
106            .with_label_values(&[OUTCOME_FAILURE])
107            .inc();
108        self.duration_seconds
109            .with_label_values(&[OUTCOME_FAILURE])
110            .observe(ms_to_secs(duration_ms));
111        self.failures_total.with_label_values(&[reason]).inc();
112    }
113
114    fn record_hold(&self, duration_s: u64) {
115        self.holds_total.inc();
116        self.hold_duration_seconds.observe(duration_s as f64);
117    }
118}