Skip to main content

lightshuttle_control/
metrics.rs

1//! Prometheus metrics for the control plane.
2//!
3//! Metrics are exposed in the Prometheus text exposition format on
4//! `GET /metrics`. The recorder is installed once per process by
5//! [`Metrics::install`]; tests build a non-installing handle via
6//! [`Metrics::for_test`] so multiple control servers can coexist
7//! without panicking on a double global install.
8
9use std::time::Instant;
10
11use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram};
12use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle};
13
/// Name of the counter incremented on every accepted restart request.
///
/// Incremented by one through [`record_restart`].
pub(crate) const RESTART_TOTAL: &str = "lightshuttle_restart_total";

/// Name of the histogram of the seconds a resource takes to go from
/// started to healthy.
///
/// Samples are recorded through [`observe_event_duration`].
pub(crate) const EVENT_DURATION: &str = "lightshuttle_lifecycle_event_duration_seconds";

/// Name of the gauge of resource count, labelled by status.
///
/// Written by [`Metrics::render`], once per entry of the status slice it
/// receives, with the status carried in the `status` label.
const RESOURCES: &str = "lightshuttle_resources";

/// Name of the gauge of orchestrator uptime in seconds.
///
/// Written by [`Metrics::render`] from the time elapsed since the
/// [`Metrics`] value was constructed.
const UPTIME: &str = "lightshuttle_uptime_seconds";
26
/// Holds the Prometheus render handle and the process start instant.
///
/// Constructed by [`Metrics::install`] (installs the global recorder) or
/// by [`Metrics::for_test`] (never touches the global recorder).
pub struct Metrics {
    /// Handle whose `render()` output is returned by [`Metrics::render`].
    handle: PrometheusHandle,
    /// Instant captured when this value was constructed; the uptime gauge
    /// reports the time elapsed since then.
    started: Instant,
}
32
33impl Metrics {
34    /// Install the global Prometheus recorder and describe every
35    /// metric. Call exactly once per process, before any metric is
36    /// recorded.
37    ///
38    /// # Panics
39    ///
40    /// Panics if a global recorder is already installed.
41    #[must_use]
42    pub fn install() -> Self {
43        let handle = PrometheusBuilder::new()
44            .install_recorder()
45            .expect("failed to install the Prometheus recorder");
46        describe_metrics();
47        Self {
48            handle,
49            started: Instant::now(),
50        }
51    }
52
53    /// Build a non-installing handle for tests.
54    ///
55    /// The `metrics!` macros always target the globally installed
56    /// recorder, which this constructor never sets. The returned handle
57    /// therefore renders an empty snapshot regardless of any metric
58    /// recorded elsewhere. Use [`Self::install`] plus
59    /// [`super::ControlState::with_metrics`] to serve live metrics.
60    #[must_use]
61    pub fn for_test() -> Self {
62        let recorder = PrometheusBuilder::new().build_recorder();
63        let handle = recorder.handle();
64        Self {
65            handle,
66            started: Instant::now(),
67        }
68    }
69
70    /// Render the current metrics, refreshing the scrape-time gauges
71    /// (`lightshuttle_resources` per status and
72    /// `lightshuttle_uptime_seconds`) just before serialising.
73    #[must_use]
74    pub fn render(&self, status_counts: &[(&str, u64)]) -> String {
75        for (status, count) in status_counts {
76            #[allow(clippy::cast_precision_loss)]
77            gauge!(RESOURCES, "status" => (*status).to_owned()).set(*count as f64);
78        }
79        #[allow(clippy::cast_precision_loss)]
80        gauge!(UPTIME).set(self.started.elapsed().as_secs_f64());
81        self.handle.render()
82    }
83}
84
85/// Increment the restart counter. Safe to call from anywhere once the
86/// recorder is installed; a no-op when no recorder is present.
87pub(crate) fn record_restart() {
88    counter!(RESTART_TOTAL).increment(1);
89}
90
91/// Observe the seconds a resource took to become healthy. Safe to
92/// call from anywhere once the recorder is installed; a no-op when no
93/// recorder is present.
94pub fn observe_event_duration(seconds: f64) {
95    histogram!(EVENT_DURATION).record(seconds);
96}
97
98fn describe_metrics() {
99    describe_counter!(RESTART_TOTAL, "Total number of accepted restart requests");
100    describe_histogram!(
101        EVENT_DURATION,
102        "Seconds a resource takes to go from started to healthy"
103    );
104    describe_gauge!(RESOURCES, "Number of managed resources, labelled by status");
105    describe_gauge!(UPTIME, "Orchestrator uptime in seconds");
106}