1use prometheus::{
9 Encoder, Histogram, HistogramOpts, IntCounter, IntCounterVec, IntGauge, Opts, Registry,
10 TextEncoder,
11};
12use std::collections::HashSet;
13use std::sync::Mutex;
14use thiserror::Error;
15
/// Maximum number of distinct `tool` label values that `record_tool_call`
/// tracks individually. Once this many tool names are known, calls for any
/// further (previously unseen) tool are counted under the `__other__` label
/// instead, which keeps the label cardinality of `tool_calls_total` bounded.
const MAX_TOOL_LABEL_CARDINALITY: usize = 1000;
20
/// Error type returned by the fallible operations of [`ArbiterMetrics`]
/// (construction and rendering).
#[derive(Debug, Error)]
pub enum MetricsError {
    /// An error reported by the `prometheus` crate. The `#[from]` attribute
    /// lets `?` convert a `prometheus::Error` into this variant automatically.
    #[error("prometheus error: {0}")]
    Prometheus(#[from] prometheus::Error),
}
27
/// Bundle of Prometheus metrics together with the registry they are
/// registered in.
///
/// The metric fields are public so callers can update them directly (for
/// example `active_sessions.set(..)`); the `record_*` / `observe_*` methods
/// are convenience wrappers, some of which sanitize label values first.
pub struct ArbiterMetrics {
    /// Registry every metric below is registered with; gathered by `render`.
    registry: Registry,

    /// `requests_total`: total requests, labelled by authorization `decision`.
    pub requests_total: IntCounterVec,

    /// `tool_calls_total`: total tool calls, labelled by `tool` name.
    pub tool_calls_total: IntCounterVec,

    /// `anomalies_total`: total anomalies detected.
    pub anomalies_total: IntCounter,

    /// `request_duration_seconds`: end-to-end request duration in seconds.
    pub request_duration_seconds: Histogram,

    /// `upstream_duration_seconds`: duration of the upstream (forwarded) call
    /// in seconds.
    pub upstream_duration_seconds: Histogram,

    /// `active_sessions`: currently active task sessions.
    pub active_sessions: IntGauge,

    /// `registered_agents`: currently registered agents.
    pub registered_agents: IntGauge,

    /// Tool label values already emitted by `record_tool_call`; used to cap
    /// the number of distinct `tool` labels at `MAX_TOOL_LABEL_CARDINALITY`.
    known_tools: Mutex<HashSet<String>>,
}
56
57impl ArbiterMetrics {
58 pub fn new() -> Result<Self, MetricsError> {
60 let registry = Registry::new();
61 Self::with_registry(registry)
62 }
63
64 pub fn with_registry(registry: Registry) -> Result<Self, MetricsError> {
66 let requests_total = IntCounterVec::new(
67 Opts::new("requests_total", "Total requests by authorization decision"),
68 &["decision"],
69 )?;
70 registry.register(Box::new(requests_total.clone()))?;
71
72 let tool_calls_total = IntCounterVec::new(
73 Opts::new("tool_calls_total", "Total tool calls by tool name"),
74 &["tool"],
75 )?;
76 registry.register(Box::new(tool_calls_total.clone()))?;
77
78 let anomalies_total =
79 IntCounter::with_opts(Opts::new("anomalies_total", "Total anomalies detected"))?;
80 registry.register(Box::new(anomalies_total.clone()))?;
81
82 let request_duration_seconds = Histogram::with_opts(
83 HistogramOpts::new(
84 "request_duration_seconds",
85 "End-to-end request duration in seconds",
86 )
87 .buckets(vec![
88 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
89 ]),
90 )?;
91 registry.register(Box::new(request_duration_seconds.clone()))?;
92
93 let upstream_duration_seconds = Histogram::with_opts(
94 HistogramOpts::new(
95 "upstream_duration_seconds",
96 "Duration of upstream (forwarded) call in seconds",
97 )
98 .buckets(vec![
99 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
100 ]),
101 )?;
102 registry.register(Box::new(upstream_duration_seconds.clone()))?;
103
104 let active_sessions = IntGauge::with_opts(Opts::new(
105 "active_sessions",
106 "Currently active task sessions",
107 ))?;
108 registry.register(Box::new(active_sessions.clone()))?;
109
110 let registered_agents = IntGauge::with_opts(Opts::new(
111 "registered_agents",
112 "Currently registered agents",
113 ))?;
114 registry.register(Box::new(registered_agents.clone()))?;
115
116 Ok(Self {
117 registry,
118 requests_total,
119 tool_calls_total,
120 anomalies_total,
121 request_duration_seconds,
122 upstream_duration_seconds,
123 active_sessions,
124 registered_agents,
125 known_tools: Mutex::new(HashSet::new()),
126 })
127 }
128
129 pub fn record_request(&self, decision: &str) {
131 let sanitized: String = decision
133 .chars()
134 .take(64)
135 .filter(|c| c.is_ascii_alphanumeric() || *c == '_')
136 .collect();
137 self.requests_total.with_label_values(&[&sanitized]).inc();
138 }
139
140 pub fn record_tool_call(&self, tool: &str) {
142 let sanitized: String = tool
144 .chars()
145 .take(128)
146 .map(|c| {
147 if c.is_ascii_graphic() || c == ' ' {
148 c
149 } else {
150 '_'
151 }
152 })
153 .collect();
154
155 let label = {
156 let mut known = self.known_tools.lock().unwrap_or_else(|e| e.into_inner());
157 if known.contains(&sanitized) || known.len() < MAX_TOOL_LABEL_CARDINALITY {
158 known.insert(sanitized.clone());
159 sanitized
160 } else {
161 "__other__".to_string()
162 }
163 };
164 self.tool_calls_total.with_label_values(&[&label]).inc();
165 }
166
167 pub fn record_anomaly(&self) {
169 self.anomalies_total.inc();
170 }
171
172 pub fn observe_request_duration(&self, seconds: f64) {
174 self.request_duration_seconds.observe(seconds);
175 }
176
177 pub fn observe_upstream_duration(&self, seconds: f64) {
179 self.upstream_duration_seconds.observe(seconds);
180 }
181
182 pub fn render(&self) -> Result<String, MetricsError> {
184 let encoder = TextEncoder::new();
185 let metric_families = self.registry.gather();
186 let mut buffer = Vec::new();
187 encoder.encode(&metric_families, &mut buffer)?;
188 Ok(String::from_utf8_lossy(&buffer).into_owned())
189 }
190}
191
#[cfg(test)]
mod tests {
    use super::*;

    /// Labelled counters accumulate per label value; the anomaly counter
    /// advances once per `record_anomaly` call.
    #[test]
    fn counter_increments() {
        let m = ArbiterMetrics::new().unwrap();

        for decision in ["allow", "allow", "deny"] {
            m.record_request(decision);
        }
        for tool in ["read_file", "read_file", "write_file"] {
            m.record_tool_call(tool);
        }
        m.record_anomaly();

        let allowed = m.requests_total.with_label_values(&["allow"]).get();
        let denied = m.requests_total.with_label_values(&["deny"]).get();
        assert_eq!(allowed, 2);
        assert_eq!(denied, 1);

        let reads = m.tool_calls_total.with_label_values(&["read_file"]).get();
        let writes = m.tool_calls_total.with_label_values(&["write_file"]).get();
        assert_eq!(reads, 2);
        assert_eq!(writes, 1);

        assert_eq!(m.anomalies_total.get(), 1);
    }

    /// The rendered text contains every metric plus the expected
    /// `# HELP` / `# TYPE` header lines.
    #[test]
    fn metrics_endpoint_returns_valid_prometheus_format() {
        let m = ArbiterMetrics::new().unwrap();

        m.record_request("allow");
        m.record_tool_call("list_dir");
        m.observe_request_duration(0.042);
        m.observe_upstream_duration(0.035);
        m.active_sessions.set(3);
        m.registered_agents.set(5);

        let rendered = m.render().unwrap();

        let expected_fragments = [
            // Metric names and gauge samples.
            "requests_total",
            "tool_calls_total",
            "anomalies_total",
            "request_duration_seconds",
            "upstream_duration_seconds",
            "active_sessions 3",
            "registered_agents 5",
            // Exposition-format headers.
            "# HELP requests_total",
            "# TYPE requests_total counter",
            "# HELP request_duration_seconds",
            "# TYPE request_duration_seconds histogram",
        ];
        for fragment in expected_fragments {
            assert!(
                rendered.contains(fragment),
                "rendered output is missing {fragment:?}"
            );
        }
    }

    /// A histogram with at least one observation renders its bucket, sum and
    /// count series.
    #[test]
    fn histogram_buckets_are_present() {
        let m = ArbiterMetrics::new().unwrap();
        m.observe_request_duration(0.05);

        let rendered = m.render().unwrap();

        for suffix in ["_bucket", "_sum", "_count"] {
            let series = format!("request_duration_seconds{suffix}");
            assert!(
                rendered.contains(&series),
                "rendered output is missing {series:?}"
            );
        }
    }

    /// Gauges support absolute sets as well as increments and decrements.
    #[test]
    fn gauges_can_increase_and_decrease() {
        let m = ArbiterMetrics::new().unwrap();
        let sessions = &m.active_sessions;
        let agents = &m.registered_agents;

        sessions.set(10);
        assert_eq!(sessions.get(), 10);

        sessions.dec();
        assert_eq!(sessions.get(), 9);

        agents.inc();
        agents.inc();
        assert_eq!(agents.get(), 2);
    }

    /// Once `MAX_TOOL_LABEL_CARDINALITY` distinct tools have been seen, new
    /// tools land in `__other__` while known tools keep their own label.
    #[test]
    fn cardinality_limiting_works() {
        let m = ArbiterMetrics::new().unwrap();
        let calls_for = |label: &str| m.tool_calls_total.with_label_values(&[label]).get();

        // Fill the label set exactly to its cap.
        (0..MAX_TOOL_LABEL_CARDINALITY)
            .map(|i| format!("tool_{i}"))
            .for_each(|name| m.record_tool_call(&name));

        // Two more, previously unseen, tools.
        for extra in ["tool_overflow_a", "tool_overflow_b"] {
            m.record_tool_call(extra);
        }

        assert_eq!(
            calls_for("__other__"),
            2,
            "overflow tool calls should be bucketed under __other__"
        );
        assert_eq!(
            calls_for("tool_0"),
            1,
            "original tool should still have its label"
        );

        // The guard is a temporary, so the lock is released before the next
        // `record_tool_call` below.
        let tracked = m.known_tools.lock().unwrap().len();
        assert_eq!(
            tracked, MAX_TOOL_LABEL_CARDINALITY,
            "known tools should be capped at MAX_TOOL_LABEL_CARDINALITY"
        );

        m.record_tool_call("tool_0");
        assert_eq!(
            calls_for("tool_0"),
            2,
            "repeated calls to known tools should still use original label"
        );
    }
}