solti_prometheus/
backend.rs1use std::sync::Arc;
8
9use prometheus::{CounterVec, HistogramVec, Registry, proto::MetricFamily};
10
11use solti_runner::{MetricsBackend, RunnerErrorKind, RunnerType, TaskOutcome};
12
13use crate::register::{Sub, ms_to_secs};
14
15pub struct PrometheusMetrics {
48 tasks_started: CounterVec,
49 tasks_completed: CounterVec,
50 tasks_duration: HistogramVec,
51 runner_errors: CounterVec,
52 registry: Arc<Registry>,
53}
54
55impl PrometheusMetrics {
56 pub fn new(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
62 let r = Sub::new(®istry, "runner");
63
64 let tasks_started = r.counter_vec(
65 "tasks_started_total",
66 "Total number of tasks started",
67 &["runner"],
68 )?;
69 let tasks_completed = r.counter_vec(
70 "tasks_completed_total",
71 "Total number of tasks completed",
72 &["runner", "outcome"],
73 )?;
74 let tasks_duration = r.histogram_vec(
75 "task_duration_seconds",
76 "Task execution duration in seconds",
77 vec![
78 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 300.0, 1800.0,
79 3600.0,
80 ],
81 &["runner", "outcome"],
82 )?;
83 let runner_errors = r.counter_vec(
84 "errors_total",
85 "Total runner-level errors",
86 &["runner", "error"],
87 )?;
88
89 Ok(Self {
90 tasks_started,
91 tasks_completed,
92 tasks_duration,
93 runner_errors,
94 registry,
95 })
96 }
97
98 pub fn new_isolated() -> Result<Self, prometheus::Error> {
103 Self::new(Arc::new(Registry::new()))
104 }
105
106 #[deprecated(
108 since = "0.0.2",
109 note = "use `PrometheusMetrics::new(registry)` — same signature, consistent with the other backends"
110 )]
111 pub fn new_with_registry(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
112 Self::new(registry)
113 }
114
115 pub fn gather(&self) -> Vec<MetricFamily> {
117 self.registry.gather()
118 }
119
120 pub fn registry(&self) -> &Arc<Registry> {
122 &self.registry
123 }
124}
125
126impl std::fmt::Debug for PrometheusMetrics {
127 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
128 f.debug_struct("PrometheusMetrics").finish()
129 }
130}
131
132impl MetricsBackend for PrometheusMetrics {
133 fn record_task_started(&self, runner_type: RunnerType) {
135 self.tasks_started
136 .with_label_values(&[runner_type.as_label()])
137 .inc();
138 }
139
140 fn record_task_completed(
148 &self,
149 runner_type: RunnerType,
150 outcome: TaskOutcome,
151 duration_ms: u64,
152 ) {
153 let runner = runner_type.as_label();
154 let label = outcome.as_label();
155
156 self.tasks_completed
157 .with_label_values(&[runner, label])
158 .inc();
159 self.tasks_duration
160 .with_label_values(&[runner, label])
161 .observe(ms_to_secs(duration_ms));
162 }
163
164 fn record_runner_error(&self, runner_type: RunnerType, error_kind: RunnerErrorKind) {
169 self.runner_errors
170 .with_label_values(&[runner_type.as_label(), error_kind.as_label()])
171 .inc();
172 }
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of series (distinct label combinations) in the family `name`.
    ///
    /// Panics with `missing` when the family is absent from `families`.
    fn series_count(families: &[MetricFamily], name: &str, missing: &str) -> usize {
        families
            .iter()
            .find(|family| family.name() == name)
            .expect(missing)
            .get_metric()
            .len()
    }

    #[test]
    fn can_create_prometheus_metrics() {
        let _metrics = PrometheusMetrics::new_isolated().expect("failed to create metrics");
    }

    #[test]
    fn record_task_started_increments_counter() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();

        // Two runner types are used, so two series are expected.
        for runner in [RunnerType::Subprocess, RunnerType::Subprocess, RunnerType::Wasm] {
            metrics.record_task_started(runner);
        }

        let families = metrics.gather();
        assert_eq!(
            series_count(
                &families,
                "solti_runner_tasks_started_total",
                "metric not found"
            ),
            2
        );
    }

    #[test]
    fn record_task_completed_increments_counter_and_histogram() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();

        // Same runner, two outcomes: two series in each family.
        metrics.record_task_completed(RunnerType::Subprocess, TaskOutcome::Success, 150);
        metrics.record_task_completed(RunnerType::Subprocess, TaskOutcome::Failure, 50);

        let families = metrics.gather();

        assert_eq!(
            series_count(
                &families,
                "solti_runner_tasks_completed_total",
                "completed counter not found"
            ),
            2
        );
        assert_eq!(
            series_count(
                &families,
                "solti_runner_task_duration_seconds",
                "duration histogram not found"
            ),
            2
        );
    }

    #[test]
    fn record_runner_error_increments_counter() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();

        // Two distinct (runner, error) pairs are recorded.
        metrics.record_runner_error(RunnerType::Subprocess, RunnerErrorKind::SpawnFailed);
        metrics.record_runner_error(RunnerType::Subprocess, RunnerErrorKind::SpawnFailed);
        metrics.record_runner_error(RunnerType::Wasm, RunnerErrorKind::ModuleLoadFailed);

        let families = metrics.gather();
        assert_eq!(
            series_count(
                &families,
                "solti_runner_errors_total",
                "errors counter not found"
            ),
            2
        );
    }

    #[test]
    fn can_use_custom_registry() {
        let registry = Arc::new(Registry::new());
        let metrics = PrometheusMetrics::new(registry.clone()).unwrap();

        metrics.record_task_started(RunnerType::Subprocess);

        // The caller's handle must observe what the backend recorded.
        let gathered = registry.gather();
        assert!(!gathered.is_empty());
    }
}