lightshuttle_control/metrics.rs
1//! Prometheus metrics for the control plane.
2//!
3//! Metrics are exposed in the Prometheus text exposition format on
4//! `GET /metrics`. The recorder is installed once per process by
5//! [`Metrics::install`]; tests build a non-installing handle via
6//! [`Metrics::for_test`] so multiple control servers can coexist
7//! without panicking on a double global install.
8
9use std::time::Instant;
10
11use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram};
12use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
13
/// Name of the counter that tracks accepted restart requests; bumped
/// by [`record_restart`].
pub(crate) const RESTART_TOTAL: &str = "lightshuttle_restart_total";

/// Name of the histogram holding, in seconds, how long a resource
/// needs to go from started to healthy; fed by
/// [`observe_event_duration`].
pub(crate) const EVENT_DURATION: &str = "lightshuttle_lifecycle_event_duration_seconds";

/// Name of the gauge reporting how many resources exist, with one
/// series per `status` label value. Written at scrape time by
/// [`Metrics::render`].
const RESOURCES: &str = "lightshuttle_resources";

/// Name of the gauge reporting orchestrator uptime in seconds.
/// Written at scrape time by [`Metrics::render`].
const UPTIME: &str = "lightshuttle_uptime_seconds";
26
27/// Holds the Prometheus render handle and the process start instant.
28pub struct Metrics {
29 handle: PrometheusHandle,
30 started: Instant,
31}
32
33impl Metrics {
34 /// Install the global Prometheus recorder and describe every
35 /// metric. Call exactly once per process, before any metric is
36 /// recorded.
37 ///
38 /// # Panics
39 ///
40 /// Panics if a global recorder is already installed.
41 #[must_use]
42 pub fn install() -> Self {
43 let handle = PrometheusBuilder::new()
44 .install_recorder()
45 .expect("failed to install the Prometheus recorder");
46 describe_metrics();
47 Self {
48 handle,
49 started: Instant::now(),
50 }
51 }
52
53 /// Build a non-installing handle for tests.
54 ///
55 /// The `metrics!` macros always target the globally installed
56 /// recorder, which this constructor never sets. The returned handle
57 /// therefore renders an empty snapshot regardless of any metric
58 /// recorded elsewhere. Use [`Self::install`] plus
59 /// [`super::ControlState::with_metrics`] to serve live metrics.
60 #[must_use]
61 pub fn for_test() -> Self {
62 let recorder = PrometheusBuilder::new().build_recorder();
63 let handle = recorder.handle();
64 Self {
65 handle,
66 started: Instant::now(),
67 }
68 }
69
70 /// Render the current metrics, refreshing the scrape-time gauges
71 /// (`lightshuttle_resources` per status and
72 /// `lightshuttle_uptime_seconds`) just before serialising.
73 #[must_use]
74 pub fn render(&self, status_counts: &[(&str, u64)]) -> String {
75 for (status, count) in status_counts {
76 #[allow(clippy::cast_precision_loss)]
77 gauge!(RESOURCES, "status" => (*status).to_owned()).set(*count as f64);
78 }
79 #[allow(clippy::cast_precision_loss)]
80 gauge!(UPTIME).set(self.started.elapsed().as_secs_f64());
81 self.handle.render()
82 }
83}
84
85/// Increment the restart counter. Safe to call from anywhere once the
86/// recorder is installed; a no-op when no recorder is present.
87pub(crate) fn record_restart() {
88 counter!(RESTART_TOTAL).increment(1);
89}
90
91/// Observe the seconds a resource took to become healthy. Safe to
92/// call from anywhere once the recorder is installed; a no-op when no
93/// recorder is present.
94pub fn observe_event_duration(seconds: f64) {
95 histogram!(EVENT_DURATION).record(seconds);
96}
97
98fn describe_metrics() {
99 describe_counter!(RESTART_TOTAL, "Total number of accepted restart requests");
100 describe_histogram!(
101 EVENT_DURATION,
102 "Seconds a resource takes to go from started to healthy"
103 );
104 describe_gauge!(RESOURCES, "Number of managed resources, labelled by status");
105 describe_gauge!(UPTIME, "Orchestrator uptime in seconds");
106}