Skip to main content

solti_runner/metrics/
backend.rs

1//! # Metrics backend trait and label types.
2//!
3//! [`MetricsBackend`] is the abstraction for collecting task execution metrics.
4//! Concrete backends (e.g. `solti-prometheus`) implement this trait.
5//!
6//! See the [metrics module](super) for the convenience [`noop_metrics`](super::noop_metrics) constructor.
7
8use std::sync::Arc;
9
/// Identifies which runner implementation a metric sample belongs to.
///
/// Every [`MetricsBackend`] method takes a `RunnerType`, which lets dashboards break metrics
/// down per runner backend.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum RunnerType {
    /// Runner backed by OS subprocesses.
    Subprocess,
    /// Runner backed by (OCI) containers.
    Container,
    /// Runner backed by WebAssembly.
    Wasm,
}
23
24impl RunnerType {
25    /// Return label value for metrics.
26    #[inline]
27    pub fn as_label(self) -> &'static str {
28        match self {
29            Self::Subprocess => "subprocess",
30            Self::Container => "container",
31            Self::Wasm => "wasm",
32        }
33    }
34}
35
/// Classifies a runner setup/teardown failure for metrics labeling.
///
/// Handed to [`MetricsBackend::record_runner_error`]. Using a closed set of kinds keeps the
/// error label bounded and low-cardinality, which free-form strings would not.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum RunnerErrorKind {
    /// Preparing the cgroup v2 (creating it or writing its attributes) failed.
    CgroupPrepareFailed,
    /// Runner-specific configuration (rlimits, capabilities, namespaces, …) could not be
    /// applied to the task command.
    BackendConfigFailed,
    /// The child process or actor could not be spawned.
    SpawnFailed,
    /// The runner module (WASM / container image) could not be loaded.
    ModuleLoadFailed,
}
51
52impl RunnerErrorKind {
53    /// Return label value for metrics.
54    #[inline]
55    pub fn as_label(self) -> &'static str {
56        match self {
57            Self::CgroupPrepareFailed => "cgroup_prepare_failed",
58            Self::BackendConfigFailed => "backend_config_failed",
59            Self::SpawnFailed => "spawn_failed",
60            Self::ModuleLoadFailed => "module_load_failed",
61        }
62    }
63}
64
/// Task execution outcome for metrics classification.
///
/// Passed to [`MetricsBackend::record_task_completed`] to say how a task ended.
///
/// `Hash` is derived, matching the other label enums in this module ([`RunnerType`],
/// [`RunnerErrorKind`]), so an outcome can be used as (part of) a `HashMap`/`HashSet` key.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TaskOutcome {
    /// Task completed successfully.
    Success,
    /// Task failed.
    Failure,
    /// Task was canceled.
    Canceled,
    /// Task timed out.
    Timeout,
}
78
79impl TaskOutcome {
80    /// Return label value for metrics.
81    #[inline]
82    pub fn as_label(&self) -> &'static str {
83        match self {
84            TaskOutcome::Success => "success",
85            TaskOutcome::Failure => "failure",
86            TaskOutcome::Canceled => "canceled",
87            TaskOutcome::Timeout => "timeout",
88        }
89    }
90}
91
92/// Backend metrics collection interface.
93///
94/// This trait abstracts metrics collection across different backends.
95/// Implementations are injected via [`crate::BuildContext`] and used by all runners.
96///
97/// ## Also
98///
99/// - [`NoOpMetrics`](super::NoOpMetrics): zero-size default backend.
100/// - [`crate::BuildContext::metrics`]: access the handle from within a runner.
101/// - `solti-prometheus::PrometheusMetrics` is a production Prometheus implementation.
102pub trait MetricsBackend: Send + Sync + 'static {
103    /// Record task spawn event.
104    ///
105    /// Called when a task is submitted and starts executing.
106    fn record_task_started(&self, runner_type: RunnerType);
107
108    /// Record task completion with outcome and duration.
109    ///
110    /// Called when task exits (success, failure, timeout, cancel).
111    fn record_task_completed(
112        &self,
113        runner_type: RunnerType,
114        outcome: TaskOutcome,
115        duration_ms: u64,
116    );
117
118    /// Record runner-specific error during task setup/teardown.
119    ///
120    /// Called when runner fails to spawn/cleanup a task.
121    /// This is separate from task failures (which are `record_task_completed` with `Failure`).
122    fn record_runner_error(&self, runner_type: RunnerType, error_kind: RunnerErrorKind);
123}
124
125/// Shared handle to metrics backend.
126///
127/// Stored in [`crate::BuildContext`] and cloned into each task.
128pub type MetricsHandle = Arc<dyn MetricsBackend>;
129
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the metrics label returned for every [`RunnerErrorKind`] variant.
    #[test]
    fn runner_error_kind_as_label_maps_all_variants() {
        let expected = [
            (RunnerErrorKind::CgroupPrepareFailed, "cgroup_prepare_failed"),
            (RunnerErrorKind::BackendConfigFailed, "backend_config_failed"),
            (RunnerErrorKind::SpawnFailed, "spawn_failed"),
            (RunnerErrorKind::ModuleLoadFailed, "module_load_failed"),
        ];

        for &(kind, label) in &expected {
            assert_eq!(kind.as_label(), label);
        }
    }
}