use std::sync::Arc;
use prometheus::{CounterVec, HistogramVec, Registry, proto::MetricFamily};
use solti_runner::{MetricsBackend, RunnerErrorKind, RunnerType, TaskOutcome};
use crate::register::{Sub, ms_to_secs};
/// Prometheus-backed metric set for task runners.
///
/// Holds the metric vectors created in `PrometheusMetrics::new` together with
/// the registry handle they were created against. The tests in this file look
/// the metrics up under a `solti_runner_` name prefix.
pub struct PrometheusMetrics {
/// Counter registered as `tasks_started_total`, labelled by `runner`.
tasks_started: CounterVec,
/// Counter registered as `tasks_completed_total`, labelled by `runner` and `outcome`.
tasks_completed: CounterVec,
/// Histogram registered as `task_duration_seconds`, labelled by `runner` and `outcome`.
tasks_duration: HistogramVec,
/// Counter registered as `errors_total`, labelled by `runner` and `error`.
runner_errors: CounterVec,
/// Registry handle supplied at construction; `gather` reads from it.
registry: Arc<Registry>,
}
impl PrometheusMetrics {
    /// Creates the runner metric set against `registry`.
    ///
    /// Four metric vectors are created through a [`Sub`] scoped to `"runner"`:
    ///
    /// - `tasks_started_total` (labels: `runner`)
    /// - `tasks_completed_total` (labels: `runner`, `outcome`)
    /// - `task_duration_seconds` (labels: `runner`, `outcome`)
    /// - `errors_total` (labels: `runner`, `error`)
    ///
    /// # Errors
    ///
    /// Propagates the [`prometheus::Error`] returned by any of the `Sub`
    /// metric-creation calls.
    pub fn new(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
        // Borrow the registry for metric creation; the `Arc` itself is moved
        // into the returned value afterwards.
        let r = Sub::new(&registry, "runner");

        let tasks_started = r.counter_vec(
            "tasks_started_total",
            "Total number of tasks started",
            &["runner"],
        )?;

        let tasks_completed = r.counter_vec(
            "tasks_completed_total",
            "Total number of tasks completed",
            &["runner", "outcome"],
        )?;

        let tasks_duration = r.histogram_vec(
            "task_duration_seconds",
            "Task execution duration in seconds",
            // Bucket upper bounds in seconds, spanning 10 ms up to one hour.
            vec![
                0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 300.0, 1800.0,
                3600.0,
            ],
            &["runner", "outcome"],
        )?;

        let runner_errors = r.counter_vec(
            "errors_total",
            "Total runner-level errors",
            &["runner", "error"],
        )?;

        Ok(Self {
            tasks_started,
            tasks_completed,
            tasks_duration,
            runner_errors,
            registry,
        })
    }

    /// Creates the metric set against a freshly constructed, private [`Registry`].
    ///
    /// # Errors
    ///
    /// Same as [`PrometheusMetrics::new`].
    pub fn new_isolated() -> Result<Self, prometheus::Error> {
        Self::new(Arc::new(Registry::new()))
    }

    /// Deprecated alias that forwards to [`PrometheusMetrics::new`].
    ///
    /// # Errors
    ///
    /// Same as [`PrometheusMetrics::new`].
    #[deprecated(
        since = "0.0.2",
        note = "use `PrometheusMetrics::new(registry)` — same signature, consistent with the other backends"
    )]
    pub fn new_with_registry(registry: Arc<Registry>) -> Result<Self, prometheus::Error> {
        Self::new(registry)
    }

    /// Returns the metric families currently gathered from the underlying registry.
    pub fn gather(&self) -> Vec<MetricFamily> {
        self.registry.gather()
    }

    /// Returns the shared registry handle these metrics were created with.
    pub fn registry(&self) -> &Arc<Registry> {
        &self.registry
    }
}
impl std::fmt::Debug for PrometheusMetrics {
    /// Formats the value as the bare type name; no fields are included.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // A struct formatter with no fields emits only the name, so write it directly.
        f.write_str("PrometheusMetrics")
    }
}
impl MetricsBackend for PrometheusMetrics {
    /// Increments the started-tasks counter for the given runner type.
    fn record_task_started(&self, runner_type: RunnerType) {
        let labels = [runner_type.as_label()];
        self.tasks_started.with_label_values(&labels).inc();
    }

    /// Increments the completed-tasks counter and observes the task duration,
    /// both under the same `(runner, outcome)` label pair.
    ///
    /// `duration_ms` is converted with `ms_to_secs` before being observed.
    fn record_task_completed(
        &self,
        runner_type: RunnerType,
        outcome: TaskOutcome,
        duration_ms: u64,
    ) {
        // One label pair shared by the counter and the histogram.
        let labels = [runner_type.as_label(), outcome.as_label()];

        let completed = self.tasks_completed.with_label_values(&labels);
        completed.inc();

        let duration = self.tasks_duration.with_label_values(&labels);
        duration.observe(ms_to_secs(duration_ms));
    }

    /// Increments the runner-error counter for the given runner type and error kind.
    fn record_runner_error(&self, runner_type: RunnerType, error_kind: RunnerErrorKind) {
        let labels = [runner_type.as_label(), error_kind.as_label()];
        self.runner_errors.with_label_values(&labels).inc();
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns how many labelled series the family called `name` holds,
    /// panicking with `missing` when no such family was gathered.
    #[track_caller]
    fn series_count(families: &[MetricFamily], name: &str, missing: &str) -> usize {
        families
            .iter()
            .find(|family| family.name() == name)
            .expect(missing)
            .get_metric()
            .len()
    }

    /// Construction against an isolated registry succeeds.
    #[test]
    fn can_create_prometheus_metrics() {
        drop(PrometheusMetrics::new_isolated().expect("failed to create metrics"));
    }

    /// Two distinct runner types yield two series on the started counter.
    #[test]
    fn record_task_started_increments_counter() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();
        for runner in [
            RunnerType::Subprocess,
            RunnerType::Subprocess,
            RunnerType::Wasm,
        ] {
            metrics.record_task_started(runner);
        }

        let gathered = metrics.gather();
        let series = series_count(
            &gathered,
            "solti_runner_tasks_started_total",
            "metric not found",
        );
        assert_eq!(series, 2);
    }

    /// Two distinct outcomes yield two series on both the counter and the histogram.
    #[test]
    fn record_task_completed_increments_counter_and_histogram() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();
        metrics.record_task_completed(RunnerType::Subprocess, TaskOutcome::Success, 150);
        metrics.record_task_completed(RunnerType::Subprocess, TaskOutcome::Failure, 50);

        let gathered = metrics.gather();
        let completed_series = series_count(
            &gathered,
            "solti_runner_tasks_completed_total",
            "completed counter not found",
        );
        assert_eq!(completed_series, 2);

        let duration_series = series_count(
            &gathered,
            "solti_runner_task_duration_seconds",
            "duration histogram not found",
        );
        assert_eq!(duration_series, 2);
    }

    /// Two distinct `(runner, error)` pairs yield two series on the error counter.
    #[test]
    fn record_runner_error_increments_counter() {
        let metrics = PrometheusMetrics::new_isolated().unwrap();
        for _ in 0..2 {
            metrics.record_runner_error(RunnerType::Subprocess, RunnerErrorKind::SpawnFailed);
        }
        metrics.record_runner_error(RunnerType::Wasm, RunnerErrorKind::ModuleLoadFailed);

        let gathered = metrics.gather();
        let series = series_count(
            &gathered,
            "solti_runner_errors_total",
            "errors counter not found",
        );
        assert_eq!(series, 2);
    }

    /// Metrics recorded through the backend are visible via the caller's registry handle.
    #[test]
    fn can_use_custom_registry() {
        let registry = Arc::new(Registry::new());
        let metrics = PrometheusMetrics::new(Arc::clone(&registry)).unwrap();
        metrics.record_task_started(RunnerType::Subprocess);

        let gathered = registry.gather();
        assert!(!gathered.is_empty());
    }
}