Skip to main content

forge_sandbox/
metrics.rs

//! Prometheus metrics for the Forge sandbox.
//!
//! This module is only compiled when the `metrics` feature is enabled.
//! It provides counters, a histogram, and a gauge for sandbox execution observability.
5
6use prometheus_client::encoding::EncodeLabelSet;
7use prometheus_client::metrics::counter::Counter;
8use prometheus_client::metrics::family::Family;
9use prometheus_client::metrics::gauge::Gauge;
10use prometheus_client::metrics::histogram::Histogram;
11use prometheus_client::registry::Registry;
12use std::sync::atomic::AtomicI64;
13
14/// Label set for execution metrics.
15#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
16pub struct ExecutionLabels {
17    /// The operation type: "search" or "execute".
18    pub operation: String,
19}
20
21/// Label set for error metrics.
22#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
23pub struct ErrorLabels {
24    /// The error kind: "timeout", "heap_limit", "js_error", "execution".
25    pub error_kind: String,
26}
27
28/// Prometheus metrics for the Forge sandbox.
29pub struct ForgeMetrics {
30    /// Total number of executions.
31    pub executions_total: Family<ExecutionLabels, Counter>,
32    /// Execution duration in seconds.
33    pub execution_duration_seconds: Family<ExecutionLabels, Histogram>,
34    /// Total number of errors by kind.
35    pub errors_total: Family<ErrorLabels, Counter>,
36    /// Current number of workers in the pool (bridged from PoolMetrics atomics).
37    pub pool_workers_alive: Gauge<i64, AtomicI64>,
38}
39
40impl ForgeMetrics {
41    /// Create a new `ForgeMetrics` and register all metrics with the given registry.
42    pub fn new(registry: &mut Registry) -> Self {
43        let executions_total = Family::default();
44        registry.register(
45            "forge_executions_total",
46            "Total sandbox executions",
47            executions_total.clone(),
48        );
49
50        let execution_duration_seconds =
51            Family::<ExecutionLabels, Histogram>::new_with_constructor(|| {
52                Histogram::new(
53                    [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0].into_iter(),
54                )
55            });
56        registry.register(
57            "forge_execution_duration_seconds",
58            "Sandbox execution duration",
59            execution_duration_seconds.clone(),
60        );
61
62        let errors_total = Family::default();
63        registry.register(
64            "forge_errors_total",
65            "Total sandbox errors by kind",
66            errors_total.clone(),
67        );
68
69        let pool_workers_alive = Gauge::default();
70        registry.register(
71            "forge_pool_workers_alive",
72            "Current workers alive in pool",
73            pool_workers_alive.clone(),
74        );
75
76        Self {
77            executions_total,
78            execution_duration_seconds,
79            errors_total,
80            pool_workers_alive,
81        }
82    }
83
84    /// Record a successful execution.
85    pub fn record_execution(&self, operation: &str, duration_secs: f64) {
86        let labels = ExecutionLabels {
87            operation: operation.to_string(),
88        };
89        self.executions_total.get_or_create(&labels).inc();
90        self.execution_duration_seconds
91            .get_or_create(&labels)
92            .observe(duration_secs);
93    }
94
95    /// Record an error.
96    pub fn record_error(&self, error_kind: &str) {
97        let labels = ErrorLabels {
98            error_kind: error_kind.to_string(),
99        };
100        self.errors_total.get_or_create(&labels).inc();
101    }
102}
103
#[cfg(test)]
mod tests {
    use super::*;
    use prometheus_client::encoding::text::encode;

    /// Encode the registry into the text exposition format.
    fn encode_to_string(registry: &Registry) -> String {
        let mut buf = String::new();
        encode(&mut buf, registry).unwrap();
        buf
    }

    /// Read the execution counter for one operation label.
    fn execution_count(metrics: &ForgeMetrics, operation: &str) -> u64 {
        let labels = ExecutionLabels {
            operation: operation.into(),
        };
        metrics.executions_total.get_or_create(&labels).get()
    }

    /// Read the error counter for one error-kind label.
    fn error_count(metrics: &ForgeMetrics, error_kind: &str) -> u64 {
        let labels = ErrorLabels {
            error_kind: error_kind.into(),
        };
        metrics.errors_total.get_or_create(&labels).get()
    }

    #[test]
    fn metrics_01_forge_metrics_creates_counters() {
        let mut registry = Registry::default();
        let metrics = ForgeMetrics::new(&mut registry);

        // A freshly created metric set starts from zero everywhere.
        assert_eq!(metrics.pool_workers_alive.get(), 0);
        assert_eq!(execution_count(&metrics, "execute"), 0);
        assert_eq!(error_count(&metrics, "timeout"), 0);

        // The gauge is always emitted, so it proves registration happened.
        let buf = encode_to_string(&registry);
        assert!(
            buf.contains("forge_pool_workers_alive"),
            "should contain pool gauge: {buf}"
        );
    }

    #[test]
    fn metrics_02_execution_counter_increments() {
        let mut registry = Registry::default();
        let metrics = ForgeMetrics::new(&mut registry);
        metrics.record_execution("execute", 0.5);
        metrics.record_execution("execute", 1.0);
        metrics.record_execution("search", 0.1);

        // Each operation label is counted independently.
        assert_eq!(execution_count(&metrics, "execute"), 2);
        assert_eq!(execution_count(&metrics, "search"), 1);
    }

    #[test]
    fn metrics_03_error_counter_increments_on_failure() {
        let mut registry = Registry::default();
        let metrics = ForgeMetrics::new(&mut registry);
        metrics.record_error("timeout");
        metrics.record_error("timeout");
        metrics.record_error("js_error");

        // Each error kind is counted independently.
        assert_eq!(error_count(&metrics, "timeout"), 2);
        assert_eq!(error_count(&metrics, "js_error"), 1);
    }

    #[test]
    fn metrics_04_pool_gauge_bridges_atomic_counters() {
        let mut registry = Registry::default();
        let metrics = ForgeMetrics::new(&mut registry);
        metrics.pool_workers_alive.set(5);
        assert_eq!(metrics.pool_workers_alive.get(), 5);
    }

    #[test]
    fn metrics_05_duration_histogram_records() {
        let mut registry = Registry::default();
        let metrics = ForgeMetrics::new(&mut registry);
        metrics.record_execution("execute", 0.05);
        metrics.record_execution("execute", 2.5);

        // The histogram has no public getter, so read the observation count
        // back from the encoded `_count` sample line.
        let buf = encode_to_string(&registry);
        let count_line = buf
            .lines()
            .find(|line| line.starts_with("forge_execution_duration_seconds_count"))
            .unwrap_or_else(|| panic!("should contain histogram count line: {buf}"));
        assert!(
            count_line.ends_with(" 2"),
            "histogram should hold two observations: {count_line}"
        );
    }

    #[test]
    fn metrics_06_metrics_encode_to_text() {
        let mut registry = Registry::default();
        let metrics = ForgeMetrics::new(&mut registry);
        metrics.record_execution("execute", 1.0);
        metrics.record_error("timeout");

        let buf = encode_to_string(&registry);

        assert!(
            buf.contains("forge_executions_total"),
            "should contain execution counter: {buf}"
        );
        assert!(
            buf.contains("forge_errors_total"),
            "should contain error counter: {buf}"
        );
        assert!(
            buf.contains("forge_execution_duration_seconds"),
            "should contain duration histogram: {buf}"
        );
        assert!(
            buf.contains("forge_pool_workers_alive"),
            "should contain pool gauge: {buf}"
        );
    }

    #[test]
    fn metrics_08_metrics_thread_safe() {
        let mut registry = Registry::default();
        let metrics = std::sync::Arc::new(ForgeMetrics::new(&mut registry));

        let m1 = metrics.clone();
        let h1 = std::thread::spawn(move || {
            m1.record_execution("execute", 0.1);
        });

        let m2 = metrics.clone();
        let h2 = std::thread::spawn(move || {
            m2.record_error("js_error");
        });

        h1.join().unwrap();
        h2.join().unwrap();

        // Updates made on other threads must be visible after joining them.
        assert_eq!(execution_count(&metrics, "execute"), 1);
        assert_eq!(error_count(&metrics, "js_error"), 1);
    }
}