Skip to main content

solti_prometheus/
backend.rs

1//! # Runner-level Prometheus metrics.
2//!
3//! [`PrometheusMetrics`] implements [`MetricsBackend`] and exposes counters and histograms for task execution events reported by runners.
4//!
5//! See the [crate root](crate) for architecture and namespace overview.
6
7use std::sync::Arc;
8
9use prometheus::{CounterVec, HistogramVec, Registry, proto::MetricFamily};
10
11use solti_runner::{MetricsBackend, RunnerErrorKind, RunnerType, TaskOutcome};
12
13use crate::register::{Sub, ms_to_secs};
14
15/// Prometheus metrics backend for solti runners.
16///
17/// Implements [`MetricsBackend`] and exposes runner-level metrics in Prometheus format.
18/// Runners call the trait methods during task lifecycle.
19///
20/// ## Metrics
21///
22/// | Metric                               | Type      | Labels              | Description                    |
23/// |--------------------------------------|-----------|---------------------|--------------------------------|
24/// | `solti_runner_tasks_started_total`   | Counter   | `runner`            | Task spawn events              |
25/// | `solti_runner_tasks_completed_total` | Counter   | `runner`, `outcome` | Task completion events         |
26/// | `solti_runner_task_duration_seconds` | Histogram | `runner`, `outcome` | Per-attempt execution duration |
27/// | `solti_runner_errors_total`          | Counter   | `runner`, `error`   | Runner setup/teardown errors   |
28///
29/// ## Labels
30///
31/// All label sets have low, bounded cardinality:
32///
33/// | Label     | Values                                                                                                            | Cardinality |
34/// |-----------|-------------------------------------------------------------------------------------------------------------------|-------------|
35/// | `runner`  | `subprocess`, `wasm`, `container`                                                                                 | Low         |
36/// | `outcome` | `success`, `failure`, `canceled`, `timeout`                                                                       | Low         |
37/// | `error`   | `cgroup_prepare_failed`, `backend_config_failed`, `spawn_failed`, `module_load_failed` (from [`RunnerErrorKind`]) | Low         |
38///
39/// ## Duration histogram buckets
40///
41/// Buckets (seconds): `0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 300, 1800, 3600`.
42///
43/// ## Also
44///
45/// - [`PrometheusSubscriber`](crate::PrometheusSubscriber) is a supervision-level metrics from the event stream.
46/// - [`Registry`](prometheus::Registry) is a shared registry for unified `/metrics` endpoint.
47pub struct PrometheusMetrics {
48    tasks_started: CounterVec,
49    tasks_completed: CounterVec,
50    tasks_duration: HistogramVec,
51    runner_errors: CounterVec,
52    registry: Arc<Registry>,
53}
54
55impl PrometheusMetrics {
56    /// Create a new metrics backend, registering all counters and histograms into the given [`Registry`].
57    ///
58    /// Primary constructor — mirrors the shape used by other backends in this
59    /// crate ([`PrometheusSubscriber::new`](crate::PrometheusSubscriber::new),
60    /// `PrometheusApiMetrics::new`, `PrometheusDiscoverMetrics::new`).
61    pub fn new(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
62        let r = Sub::new(&registry, "runner");
63
64        let tasks_started = r.counter_vec(
65            "tasks_started_total",
66            "Total number of tasks started",
67            &["runner"],
68        )?;
69        let tasks_completed = r.counter_vec(
70            "tasks_completed_total",
71            "Total number of tasks completed",
72            &["runner", "outcome"],
73        )?;
74        let tasks_duration = r.histogram_vec(
75            "task_duration_seconds",
76            "Task execution duration in seconds",
77            vec![
78                0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 300.0, 1800.0,
79                3600.0,
80            ],
81            &["runner", "outcome"],
82        )?;
83        let runner_errors = r.counter_vec(
84            "errors_total",
85            "Total runner-level errors",
86            &["runner", "error"],
87        )?;
88
89        Ok(Self {
90            tasks_started,
91            tasks_completed,
92            tasks_duration,
93            runner_errors,
94            registry,
95        })
96    }
97
98    /// Create a new metrics backend with an **isolated** registry.
99    ///
100    /// Convenience for tests / standalone use. Most agents share a single
101    /// registry across collectors via [`Self::new`].
102    pub fn new_isolated() -> Result<Self, prometheus::Error> {
103        Self::new(Arc::new(Registry::new()))
104    }
105
106    /// Deprecated alias of [`Self::new`].
107    #[deprecated(
108        since = "0.0.2",
109        note = "use `PrometheusMetrics::new(registry)` — same signature, consistent with the other backends"
110    )]
111    pub fn new_with_registry(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
112        Self::new(registry)
113    }
114
115    /// Gather all metrics for exposition.
116    pub fn gather(&self) -> Vec<MetricFamily> {
117        self.registry.gather()
118    }
119
120    /// Get reference to underlying prometheus registry.
121    pub fn registry(&self) -> &Arc<Registry> {
122        &self.registry
123    }
124}
125
126impl std::fmt::Debug for PrometheusMetrics {
127    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
128        f.debug_struct("PrometheusMetrics").finish()
129    }
130}
131
132impl MetricsBackend for PrometheusMetrics {
133    /// Increments `solti_runner_tasks_started_total{runner=<runner_type>}`.
134    fn record_task_started(&self, runner_type: RunnerType) {
135        self.tasks_started
136            .with_label_values(&[runner_type.as_label()])
137            .inc();
138    }
139
140    /// Records a task completion event.
141    ///
142    /// Updates two metrics:
143    /// - `solti_runner_tasks_completed_total{runner, outcome}` - incremented by 1.
144    /// - `solti_runner_task_duration_seconds{runner, outcome}` - observes the duration converted from milliseconds to seconds.
145    ///
146    /// The `outcome` label is derived from [`TaskOutcome::as_label`]: `success` | `failure` | `canceled` | `timeout`.
147    fn record_task_completed(
148        &self,
149        runner_type: RunnerType,
150        outcome: TaskOutcome,
151        duration_ms: u64,
152    ) {
153        let runner = runner_type.as_label();
154        let label = outcome.as_label();
155
156        self.tasks_completed
157            .with_label_values(&[runner, label])
158            .inc();
159        self.tasks_duration
160            .with_label_values(&[runner, label])
161            .observe(ms_to_secs(duration_ms));
162    }
163
164    /// Increments `solti_runner_errors_total{runner=<runner_type>, error=<error_kind>}`.
165    ///
166    /// Called for runner setup/teardown errors (e.g. spawn failures), **not** for task-level failures
167    /// which go through [`record_task_completed`](MetricsBackend::record_task_completed).
168    fn record_runner_error(&self, runner_type: RunnerType, error_kind: RunnerErrorKind) {
169        self.runner_errors
170            .with_label_values(&[runner_type.as_label(), error_kind.as_label()])
171            .inc();
172    }
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    use prometheus::proto::Metric;

    /// Find a metric family by its full exposition name in a gathered snapshot.
    fn family<'a>(families: &'a [MetricFamily], name: &str) -> Option<&'a MetricFamily> {
        families.iter().find(|f| f.name() == name)
    }

    /// Find the series in `family` whose labels include every `(name, value)` pair given.
    fn series<'a>(family: &'a MetricFamily, labels: &[(&str, &str)]) -> Option<&'a Metric> {
        family.get_metric().iter().find(|metric| {
            labels.iter().all(|(name, value)| {
                metric
                    .get_label()
                    .iter()
                    .any(|pair| pair.name() == *name && pair.value() == *value)
            })
        })
    }

    /// Current value of the counter series selected by `labels`, if such a series exists.
    fn counter_value(family: &MetricFamily, labels: &[(&str, &str)]) -> Option<f64> {
        series(family, labels).map(|metric| metric.get_counter().value())
    }

    /// Number of observations in the histogram series selected by `labels`, if it exists.
    fn sample_count(family: &MetricFamily, labels: &[(&str, &str)]) -> Option<u64> {
        series(family, labels).map(|metric| metric.get_histogram().sample_count())
    }

    #[test]
    fn can_create_prometheus_metrics() {
        let _metrics = PrometheusMetrics::new_isolated().expect("failed to create metrics");
    }

    #[test]
    fn record_task_started_increments_counter() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();

        metrics.record_task_started(RunnerType::Subprocess);
        metrics.record_task_started(RunnerType::Subprocess);
        metrics.record_task_started(RunnerType::Wasm);

        let families = metrics.gather();
        let started =
            family(&families, "solti_runner_tasks_started_total").expect("metric not found");

        // One series per distinct runner label...
        assert_eq!(started.get_metric().len(), 2);
        // ...and each series carries the number of calls made for that runner.
        assert_eq!(
            counter_value(started, &[("runner", RunnerType::Subprocess.as_label())]),
            Some(2.0)
        );
        assert_eq!(
            counter_value(started, &[("runner", RunnerType::Wasm.as_label())]),
            Some(1.0)
        );
    }

    #[test]
    fn record_task_completed_increments_counter_and_histogram() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();

        metrics.record_task_completed(RunnerType::Subprocess, TaskOutcome::Success, 150);
        metrics.record_task_completed(RunnerType::Subprocess, TaskOutcome::Failure, 50);

        let families = metrics.gather();
        let runner = RunnerType::Subprocess.as_label();
        let succeeded = [("runner", runner), ("outcome", TaskOutcome::Success.as_label())];
        let failed = [("runner", runner), ("outcome", TaskOutcome::Failure.as_label())];

        let completed = family(&families, "solti_runner_tasks_completed_total")
            .expect("completed counter not found");
        assert_eq!(completed.get_metric().len(), 2);
        assert_eq!(counter_value(completed, &succeeded), Some(1.0));
        assert_eq!(counter_value(completed, &failed), Some(1.0));

        let duration = family(&families, "solti_runner_task_duration_seconds")
            .expect("duration histogram not found");
        assert_eq!(duration.get_metric().len(), 2);
        assert_eq!(sample_count(duration, &succeeded), Some(1));
        assert_eq!(sample_count(duration, &failed), Some(1));
    }

    #[test]
    fn record_runner_error_increments_counter() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();

        metrics.record_runner_error(RunnerType::Subprocess, RunnerErrorKind::SpawnFailed);
        metrics.record_runner_error(RunnerType::Subprocess, RunnerErrorKind::SpawnFailed);
        metrics.record_runner_error(RunnerType::Wasm, RunnerErrorKind::ModuleLoadFailed);

        let families = metrics.gather();
        let errors =
            family(&families, "solti_runner_errors_total").expect("errors counter not found");

        assert_eq!(errors.get_metric().len(), 2);
        assert_eq!(
            counter_value(
                errors,
                &[
                    ("runner", RunnerType::Subprocess.as_label()),
                    ("error", RunnerErrorKind::SpawnFailed.as_label()),
                ],
            ),
            Some(2.0)
        );
        assert_eq!(
            counter_value(
                errors,
                &[
                    ("runner", RunnerType::Wasm.as_label()),
                    ("error", RunnerErrorKind::ModuleLoadFailed.as_label()),
                ],
            ),
            Some(1.0)
        );
    }

    #[test]
    fn can_use_custom_registry() {
        let registry = Arc::new(Registry::new());
        let metrics = PrometheusMetrics::new(registry.clone()).unwrap();

        // The backend must hold the caller's registry, not a private copy.
        assert!(Arc::ptr_eq(metrics.registry(), &registry));

        metrics.record_task_started(RunnerType::Subprocess);
        assert!(!registry.gather().is_empty());
    }
}