Skip to main content

arbiter_metrics/
lib.rs

//! Arbiter Metrics: Prometheus-compatible metrics for the Arbiter proxy.
//!
//! Provides counters (requests, tool calls, anomalies), histograms (request and
//! upstream duration), and gauges (active sessions, registered agents). The
//! [`ArbiterMetrics::render`] method returns all metrics in the Prometheus text
//! exposition format, suitable for serving from a `/metrics` endpoint.
7
8use prometheus::{
9    Encoder, Histogram, HistogramOpts, IntCounter, IntCounterVec, IntGauge, Opts, Registry,
10    TextEncoder,
11};
12use std::collections::HashSet;
13use std::sync::Mutex;
14use thiserror::Error;
15
/// Upper bound on the number of distinct tool names that get their own metric
/// label. Once this many unique names have been seen, any new name is counted
/// under "__other__" so that memory use stays bounded.
const MAX_TOOL_LABEL_CARDINALITY: usize = 1_000;
20
21/// Errors from the metrics subsystem.
22#[derive(Debug, Error)]
23pub enum MetricsError {
24    #[error("prometheus error: {0}")]
25    Prometheus(#[from] prometheus::Error),
26}
27
28/// All Arbiter proxy metrics, registered against a single [`Registry`].
29pub struct ArbiterMetrics {
30    registry: Registry,
31
32    /// Total requests by authorization decision (allow / deny / escalate).
33    pub requests_total: IntCounterVec,
34
35    /// Total tool calls by tool name.
36    pub tool_calls_total: IntCounterVec,
37
38    /// Total anomalies detected.
39    pub anomalies_total: IntCounter,
40
41    /// End-to-end request duration in seconds.
42    pub request_duration_seconds: Histogram,
43
44    /// Duration of the upstream (forwarded) call in seconds.
45    pub upstream_duration_seconds: Histogram,
46
47    /// Number of currently active task sessions.
48    pub active_sessions: IntGauge,
49
50    /// Number of currently registered agents.
51    pub registered_agents: IntGauge,
52
53    /// Tracks unique tool label values to enforce cardinality limits.
54    known_tools: Mutex<HashSet<String>>,
55}
56
57impl ArbiterMetrics {
58    /// Create and register all metrics against a new registry.
59    pub fn new() -> Result<Self, MetricsError> {
60        let registry = Registry::new();
61        Self::with_registry(registry)
62    }
63
64    /// Create and register all metrics against the provided registry.
65    pub fn with_registry(registry: Registry) -> Result<Self, MetricsError> {
66        let requests_total = IntCounterVec::new(
67            Opts::new("requests_total", "Total requests by authorization decision"),
68            &["decision"],
69        )?;
70        registry.register(Box::new(requests_total.clone()))?;
71
72        let tool_calls_total = IntCounterVec::new(
73            Opts::new("tool_calls_total", "Total tool calls by tool name"),
74            &["tool"],
75        )?;
76        registry.register(Box::new(tool_calls_total.clone()))?;
77
78        let anomalies_total =
79            IntCounter::with_opts(Opts::new("anomalies_total", "Total anomalies detected"))?;
80        registry.register(Box::new(anomalies_total.clone()))?;
81
82        let request_duration_seconds = Histogram::with_opts(
83            HistogramOpts::new(
84                "request_duration_seconds",
85                "End-to-end request duration in seconds",
86            )
87            .buckets(vec![
88                0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
89            ]),
90        )?;
91        registry.register(Box::new(request_duration_seconds.clone()))?;
92
93        let upstream_duration_seconds = Histogram::with_opts(
94            HistogramOpts::new(
95                "upstream_duration_seconds",
96                "Duration of upstream (forwarded) call in seconds",
97            )
98            .buckets(vec![
99                0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
100            ]),
101        )?;
102        registry.register(Box::new(upstream_duration_seconds.clone()))?;
103
104        let active_sessions = IntGauge::with_opts(Opts::new(
105            "active_sessions",
106            "Currently active task sessions",
107        ))?;
108        registry.register(Box::new(active_sessions.clone()))?;
109
110        let registered_agents = IntGauge::with_opts(Opts::new(
111            "registered_agents",
112            "Currently registered agents",
113        ))?;
114        registry.register(Box::new(registered_agents.clone()))?;
115
116        Ok(Self {
117            registry,
118            requests_total,
119            tool_calls_total,
120            anomalies_total,
121            request_duration_seconds,
122            upstream_duration_seconds,
123            active_sessions,
124            registered_agents,
125            known_tools: Mutex::new(HashSet::new()),
126        })
127    }
128
129    /// Record a request with the given authorization decision.
130    pub fn record_request(&self, decision: &str) {
131        // Sanitize decision label.
132        let sanitized: String = decision
133            .chars()
134            .take(64)
135            .filter(|c| c.is_ascii_alphanumeric() || *c == '_')
136            .collect();
137        self.requests_total.with_label_values(&[&sanitized]).inc();
138    }
139
140    /// Record a tool call for the given tool name.
141    pub fn record_tool_call(&self, tool: &str) {
142        // Sanitize and limit metric label cardinality.
143        let sanitized: String = tool
144            .chars()
145            .take(128)
146            .map(|c| {
147                if c.is_ascii_graphic() || c == ' ' {
148                    c
149                } else {
150                    '_'
151                }
152            })
153            .collect();
154
155        let label = {
156            let mut known = self.known_tools.lock().unwrap_or_else(|e| e.into_inner());
157            if known.contains(&sanitized) || known.len() < MAX_TOOL_LABEL_CARDINALITY {
158                known.insert(sanitized.clone());
159                sanitized
160            } else {
161                "__other__".to_string()
162            }
163        };
164        self.tool_calls_total.with_label_values(&[&label]).inc();
165    }
166
167    /// Record an anomaly.
168    pub fn record_anomaly(&self) {
169        self.anomalies_total.inc();
170    }
171
172    /// Observe a request duration in seconds.
173    pub fn observe_request_duration(&self, seconds: f64) {
174        self.request_duration_seconds.observe(seconds);
175    }
176
177    /// Observe an upstream call duration in seconds.
178    pub fn observe_upstream_duration(&self, seconds: f64) {
179        self.upstream_duration_seconds.observe(seconds);
180    }
181
182    /// Render all metrics in the Prometheus text exposition format.
183    pub fn render(&self) -> Result<String, MetricsError> {
184        let encoder = TextEncoder::new();
185        let metric_families = self.registry.gather();
186        let mut buffer = Vec::new();
187        encoder.encode(&metric_families, &mut buffer)?;
188        Ok(String::from_utf8_lossy(&buffer).into_owned())
189    }
190}
191
192#[cfg(test)]
193mod tests {
194    use super::*;
195
196    #[test]
197    fn counter_increments() {
198        let metrics = ArbiterMetrics::new().unwrap();
199
200        metrics.record_request("allow");
201        metrics.record_request("allow");
202        metrics.record_request("deny");
203        metrics.record_tool_call("read_file");
204        metrics.record_tool_call("read_file");
205        metrics.record_tool_call("write_file");
206        metrics.record_anomaly();
207
208        assert_eq!(
209            metrics.requests_total.with_label_values(&["allow"]).get(),
210            2
211        );
212        assert_eq!(metrics.requests_total.with_label_values(&["deny"]).get(), 1);
213        assert_eq!(
214            metrics
215                .tool_calls_total
216                .with_label_values(&["read_file"])
217                .get(),
218            2
219        );
220        assert_eq!(
221            metrics
222                .tool_calls_total
223                .with_label_values(&["write_file"])
224                .get(),
225            1
226        );
227        assert_eq!(metrics.anomalies_total.get(), 1);
228    }
229
230    #[test]
231    fn metrics_endpoint_returns_valid_prometheus_format() {
232        let metrics = ArbiterMetrics::new().unwrap();
233
234        metrics.record_request("allow");
235        metrics.record_tool_call("list_dir");
236        metrics.observe_request_duration(0.042);
237        metrics.observe_upstream_duration(0.035);
238        metrics.active_sessions.set(3);
239        metrics.registered_agents.set(5);
240
241        let output = metrics.render().unwrap();
242
243        // Prometheus format: lines are either comments (# ...) or metric lines.
244        assert!(output.contains("requests_total"));
245        assert!(output.contains("tool_calls_total"));
246        assert!(output.contains("anomalies_total"));
247        assert!(output.contains("request_duration_seconds"));
248        assert!(output.contains("upstream_duration_seconds"));
249        assert!(output.contains("active_sessions 3"));
250        assert!(output.contains("registered_agents 5"));
251
252        // Verify HELP and TYPE lines exist (Prometheus convention).
253        assert!(output.contains("# HELP requests_total"));
254        assert!(output.contains("# TYPE requests_total counter"));
255        assert!(output.contains("# HELP request_duration_seconds"));
256        assert!(output.contains("# TYPE request_duration_seconds histogram"));
257    }
258
259    #[test]
260    fn histogram_buckets_are_present() {
261        let metrics = ArbiterMetrics::new().unwrap();
262        metrics.observe_request_duration(0.05);
263
264        let output = metrics.render().unwrap();
265
266        // Histograms should have _bucket, _sum, and _count lines.
267        assert!(output.contains("request_duration_seconds_bucket"));
268        assert!(output.contains("request_duration_seconds_sum"));
269        assert!(output.contains("request_duration_seconds_count"));
270    }
271
272    #[test]
273    fn gauges_can_increase_and_decrease() {
274        let metrics = ArbiterMetrics::new().unwrap();
275
276        metrics.active_sessions.set(10);
277        assert_eq!(metrics.active_sessions.get(), 10);
278
279        metrics.active_sessions.dec();
280        assert_eq!(metrics.active_sessions.get(), 9);
281
282        metrics.registered_agents.inc();
283        metrics.registered_agents.inc();
284        assert_eq!(metrics.registered_agents.get(), 2);
285    }
286
287    /// Cardinality limiting must cap unique tool labels at MAX_TOOL_LABEL_CARDINALITY.
288    /// The 1001st unique tool name should be bucketed under "__other__".
289    #[test]
290    fn cardinality_limiting_works() {
291        let metrics = ArbiterMetrics::new().unwrap();
292
293        // Record exactly MAX_TOOL_LABEL_CARDINALITY unique tool names
294        for i in 0..MAX_TOOL_LABEL_CARDINALITY {
295            metrics.record_tool_call(&format!("tool_{i}"));
296        }
297
298        // The next unique tool name should be bucketed under "__other__"
299        metrics.record_tool_call("tool_overflow_a");
300        metrics.record_tool_call("tool_overflow_b");
301
302        // Verify __other__ got the overflow calls
303        let other_count = metrics
304            .tool_calls_total
305            .with_label_values(&["__other__"])
306            .get();
307        assert_eq!(
308            other_count, 2,
309            "overflow tool calls should be bucketed under __other__"
310        );
311
312        // Verify one of the original tools is still tracked under its own name
313        let first_count = metrics
314            .tool_calls_total
315            .with_label_values(&["tool_0"])
316            .get();
317        assert_eq!(first_count, 1, "original tool should still have its label");
318
319        // Verify the known_tools set is capped
320        let known = metrics.known_tools.lock().unwrap();
321        assert_eq!(
322            known.len(),
323            MAX_TOOL_LABEL_CARDINALITY,
324            "known tools should be capped at MAX_TOOL_LABEL_CARDINALITY"
325        );
326
327        // Verify that a previously-known tool still gets its own label
328        // even after the cap is reached
329        drop(known);
330        metrics.record_tool_call("tool_0");
331        let first_count = metrics
332            .tool_calls_total
333            .with_label_values(&["tool_0"])
334            .get();
335        assert_eq!(
336            first_count, 2,
337            "repeated calls to known tools should still use original label"
338        );
339    }
340}