use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use prometheus::{Counter, CounterVec, Gauge, Histogram, HistogramVec, Registry};
use solti_discover::{DiscoverMetricsBackend, OUTCOME_FAILURE, OUTCOME_SUCCESS};
use crate::register::{Sub, ms_to_secs};
/// Prometheus metric handles used to report discovery heartbeat activity.
///
/// Every handle is created by [`PrometheusDiscoverMetrics::new`]; the fields are
/// private and are only updated through the `DiscoverMetricsBackend` methods.
pub struct PrometheusDiscoverMetrics {
    /// `attempts_total`: total discovery heartbeat attempts.
    attempts_total: Counter,
    /// `outcomes_total`: heartbeat outcomes, labelled by `outcome`.
    outcomes_total: CounterVec,
    /// `failures_total`: heartbeat failures, labelled by `reason`.
    failures_total: CounterVec,
    /// `duration_seconds`: heartbeat call duration, labelled by `outcome`.
    duration_seconds: HistogramVec,
    /// `last_success_timestamp_seconds`: UNIX timestamp of the last successful heartbeat.
    last_success_ts: Gauge,
    /// `holds_total`: server-advised retry holds observed.
    holds_total: Counter,
    /// `hold_duration_seconds`: duration of server-advised retry holds.
    hold_duration_seconds: Histogram,
}
impl PrometheusDiscoverMetrics {
    /// Builds every discovery metric handle through a [`Sub`] helper created from
    /// `registry` and the name `"discover"`.
    ///
    /// NOTE(review): `Sub` presumably registers each metric on `registry` under a
    /// `discover` prefix — confirm against `crate::register::Sub`.
    ///
    /// # Errors
    ///
    /// Returns the first [`prometheus::Error`] reported by any of the `Sub`
    /// constructor calls; no `Self` is produced in that case.
    pub fn new(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
        let r = Sub::new(&registry, "discover");

        let attempts_total = r.counter("attempts_total", "Total discovery heartbeat attempts")?;
        let outcomes_total = r.counter_vec(
            "outcomes_total",
            "Discovery heartbeat outcomes",
            &["outcome"],
        )?;
        // Bucket bounds are in seconds: 5 ms up to 60 s.
        let duration_seconds = r.histogram_vec(
            "duration_seconds",
            "Discovery heartbeat call duration",
            vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
            &["outcome"],
        )?;
        let failures_total = r.counter_vec(
            "failures_total",
            "Discovery heartbeat failures grouped by reason",
            &["reason"],
        )?;
        let last_success_ts = r.gauge(
            "last_success_timestamp_seconds",
            "UNIX timestamp of the last successful heartbeat",
        )?;
        let holds_total = r.counter("holds_total", "Server-advised retry holds observed")?;
        // Bucket bounds are in seconds: 1 s up to 1 h.
        let hold_duration_seconds = r.histogram(
            "hold_duration_seconds",
            "Duration of server-advised retry holds",
            vec![1.0, 5.0, 15.0, 30.0, 60.0, 300.0, 900.0, 1800.0, 3600.0],
        )?;

        Ok(Self {
            attempts_total,
            outcomes_total,
            duration_seconds,
            failures_total,
            last_success_ts,
            holds_total,
            hold_duration_seconds,
        })
    }
}
/// Opaque `Debug` output: only the type name is printed, never the metric handles.
impl std::fmt::Debug for PrometheusDiscoverMetrics {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `write_str` (not `pad`) so width/alignment flags are ignored, exactly like
        // a field-less `debug_struct(..).finish()`.
        f.write_str("PrometheusDiscoverMetrics")
    }
}
/// Forwards discovery events to the Prometheus handles held by this struct.
impl DiscoverMetricsBackend for PrometheusDiscoverMetrics {
    /// Increments the attempts counter by one.
    fn record_attempt(&self) {
        self.attempts_total.inc();
    }

    /// Records a successful heartbeat: bumps the `OUTCOME_SUCCESS` outcome counter,
    /// observes `ms_to_secs(duration_ms)` in the duration histogram under the same
    /// label, and stores the current wall-clock time in the last-success gauge.
    fn record_success(&self, duration_ms: u64) {
        let succeeded = self.outcomes_total.with_label_values(&[OUTCOME_SUCCESS]);
        succeeded.inc();

        let latency = self.duration_seconds.with_label_values(&[OUTCOME_SUCCESS]);
        latency.observe(ms_to_secs(duration_ms));

        // `duration_since` fails when the system clock is before the UNIX epoch;
        // the gauge is set to 0.0 in that case.
        let now_secs = match SystemTime::now().duration_since(UNIX_EPOCH) {
            Ok(since_epoch) => since_epoch.as_secs_f64(),
            Err(_) => 0.0,
        };
        self.last_success_ts.set(now_secs);
    }

    /// Records a failed heartbeat: bumps the `OUTCOME_FAILURE` outcome counter,
    /// observes `ms_to_secs(duration_ms)` in the duration histogram under the same
    /// label, and increments the failures counter labelled with `reason`.
    fn record_failure(&self, duration_ms: u64, reason: &'static str) {
        let failed = self.outcomes_total.with_label_values(&[OUTCOME_FAILURE]);
        failed.inc();

        let latency = self.duration_seconds.with_label_values(&[OUTCOME_FAILURE]);
        latency.observe(ms_to_secs(duration_ms));

        let by_reason = self.failures_total.with_label_values(&[reason]);
        by_reason.inc();
    }

    /// Records one hold: increments the holds counter and observes `duration_s`
    /// (converted to `f64`) in the hold-duration histogram.
    fn record_hold(&self, duration_s: u64) {
        self.holds_total.inc();

        let held_for = duration_s as f64;
        self.hold_duration_seconds.observe(held_for);
    }
}