claude_agent/observability/
metrics.rs

1//! Metrics collection and export.
2//!
3//! Provides built-in atomic metrics for local tracking, with optional
4//! OpenTelemetry export when the `otel` feature is enabled.
5
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::time::Duration;
8
9#[cfg(feature = "otel")]
10use super::otel::{OtelConfig, OtelMetricsBridge, SERVICE_NAME_DEFAULT};
11#[cfg(feature = "otel")]
12use opentelemetry::global;
13
/// Settings that describe how metrics should be collected.
///
/// The derived [`Default`] produces the same value as
/// [`MetricsConfig::disabled`] (`enabled: false`, no export interval); call
/// [`MetricsConfig::new`] to obtain an enabled configuration.
#[derive(Clone, Default)]
pub struct MetricsConfig {
    /// Whether metrics are switched on.
    pub enabled: bool,
    /// Optional export interval.
    // NOTE(review): presumably the period of a periodic exporter — confirm
    // against whichever component consumes this field.
    pub export_interval: Option<Duration>,
}

impl MetricsConfig {
    /// Returns an enabled configuration with a 60-second export interval.
    pub fn new() -> Self {
        const DEFAULT_EXPORT_INTERVAL: Duration = Duration::from_secs(60);

        Self {
            export_interval: Some(DEFAULT_EXPORT_INTERVAL),
            enabled: true,
        }
    }

    /// Returns a configuration with metrics turned off and no export interval.
    pub fn disabled() -> Self {
        Self {
            export_interval: None,
            enabled: false,
        }
    }
}
36
/// Event counter that can be bumped through a shared reference.
///
/// The value lives in a single [`AtomicU64`]; every access uses
/// [`Ordering::Relaxed`], so the counter itself is safe to share between
/// threads but imposes no ordering on surrounding memory operations.
#[derive(Default)]
pub struct Counter {
    value: AtomicU64,
}

impl Counter {
    /// Creates a counter whose value is zero.
    pub fn new() -> Self {
        Self {
            value: AtomicU64::new(0),
        }
    }

    /// Adds one to the counter.
    pub fn inc(&self) {
        self.add(1);
    }

    /// Adds `n` to the counter (wrapping on `u64` overflow, as
    /// [`AtomicU64::fetch_add`] does).
    pub fn add(&self, n: u64) {
        let _previous = self.value.fetch_add(n, Ordering::Relaxed);
    }

    /// Returns the current value.
    pub fn get(&self) -> u64 {
        let current = self.value.load(Ordering::Relaxed);
        current
    }
}
60
/// Thread-safe atomic gauge.
///
/// Holds an unsigned value that can be set, incremented and decremented
/// through a shared reference. All operations use [`Ordering::Relaxed`].
#[derive(Default)]
pub struct Gauge {
    value: AtomicU64,
}

impl Gauge {
    /// Creates a gauge whose value is zero.
    pub fn new() -> Self {
        Self::default()
    }

    /// Overwrites the gauge with `value`.
    pub fn set(&self, value: u64) {
        self.value.store(value, Ordering::Relaxed);
    }

    /// Increases the gauge by one.
    pub fn inc(&self) {
        self.value.fetch_add(1, Ordering::Relaxed);
    }

    /// Decreases the gauge by one, stopping at zero.
    ///
    /// A plain `fetch_sub` would wrap a zero gauge around to `u64::MAX`
    /// (for example when a decrement is not matched by an earlier
    /// increment), which then reads as an absurdly large value. The
    /// decrement is therefore skipped when the gauge is already zero.
    pub fn dec(&self) {
        // `checked_sub` yields `None` at zero, which makes `fetch_update`
        // leave the stored value untouched and return `Err`; that outcome is
        // the intended "already at the floor" case, so the result is ignored.
        let _ = self
            .value
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
                current.checked_sub(1)
            });
    }

    /// Returns the current value.
    pub fn get(&self) -> u64 {
        self.value.load(Ordering::Relaxed)
    }
}
88
/// Simple histogram using fixed buckets.
///
/// Each observation is counted in the first bucket whose upper bound is
/// greater than or equal to the value; values above every bound (and NaN,
/// which compares false against every bound) land in a trailing overflow
/// bucket. Because the first matching bound wins, `bucket_bounds` is expected
/// to be supplied in ascending order.
pub struct Histogram {
    /// One counter per bound, plus one overflow bucket at the end.
    buckets: Vec<AtomicU64>,
    /// Inclusive upper bounds of the buckets.
    bucket_bounds: Vec<f64>,
    /// Running total of observed values, stored as the bit pattern of an
    /// `f64` so fractional contributions are not lost on each observation.
    sum: AtomicU64,
    /// Number of observations recorded.
    count: AtomicU64,
}

impl Histogram {
    /// Creates an empty histogram with the given inclusive upper bounds.
    pub fn new(bucket_bounds: Vec<f64>) -> Self {
        // `0..=len` allocates the extra overflow bucket.
        let buckets = (0..=bucket_bounds.len())
            .map(|_| AtomicU64::new(0))
            .collect();
        Self {
            buckets,
            bucket_bounds,
            sum: AtomicU64::new(0.0_f64.to_bits()),
            count: AtomicU64::new(0),
        }
    }

    /// Creates a histogram with the default latency bounds (10 through 10000).
    pub fn default_latency() -> Self {
        Self::new(vec![
            10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0, 10000.0,
        ])
    }

    /// Records one observation.
    ///
    /// The observation always increments its bucket and the total count.
    /// Only finite, positive values are added to the running sum: NaN,
    /// infinities, zero and negative values contribute nothing, so a single
    /// bad sample cannot poison or wrap the total.
    pub fn observe(&self, value: f64) {
        let bucket_idx = self
            .bucket_bounds
            .iter()
            .position(|&bound| value <= bound)
            .unwrap_or(self.bucket_bounds.len());

        self.buckets[bucket_idx].fetch_add(1, Ordering::Relaxed);

        if value.is_finite() && value > 0.0 {
            // Accumulate in floating point; truncating each observation to an
            // integer first would silently drop up to one unit per sample.
            // The closure always returns `Some`, so the update cannot fail.
            let _ = self
                .sum
                .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |bits| {
                    Some((f64::from_bits(bits) + value).to_bits())
                });
        }

        self.count.fetch_add(1, Ordering::Relaxed);
    }

    /// Returns the number of observations recorded so far.
    pub fn count(&self) -> u64 {
        self.count.load(Ordering::Relaxed)
    }

    /// Returns the running sum of observed values, truncated to an integer.
    ///
    /// The float-to-integer cast saturates, so an extremely large total is
    /// reported as `u64::MAX` rather than wrapping.
    pub fn sum(&self) -> u64 {
        f64::from_bits(self.sum.load(Ordering::Relaxed)) as u64
    }
}
136
137/// Agent-specific metrics registry.
138///
139/// Tracks metrics locally with atomic counters, and optionally exports
140/// to OpenTelemetry when the `otel` feature is enabled.
141pub struct MetricsRegistry {
142    pub requests_total: Counter,
143    pub requests_success: Counter,
144    pub requests_error: Counter,
145    pub tokens_input: Counter,
146    pub tokens_output: Counter,
147    pub cache_read_tokens: Counter,
148    pub cache_creation_tokens: Counter,
149    pub tool_calls_total: Counter,
150    pub tool_errors: Counter,
151    pub active_sessions: Gauge,
152    pub request_latency_ms: Histogram,
153    pub cost_total_micros: Counter,
154    #[cfg(feature = "otel")]
155    otel_bridge: Option<OtelMetricsBridge>,
156}
157
158impl MetricsRegistry {
159    pub fn new(_config: &MetricsConfig) -> Self {
160        Self {
161            requests_total: Counter::new(),
162            requests_success: Counter::new(),
163            requests_error: Counter::new(),
164            tokens_input: Counter::new(),
165            tokens_output: Counter::new(),
166            cache_read_tokens: Counter::new(),
167            cache_creation_tokens: Counter::new(),
168            tool_calls_total: Counter::new(),
169            tool_errors: Counter::new(),
170            active_sessions: Gauge::new(),
171            request_latency_ms: Histogram::default_latency(),
172            cost_total_micros: Counter::new(),
173            #[cfg(feature = "otel")]
174            otel_bridge: None,
175        }
176    }
177
178    #[cfg(feature = "otel")]
179    pub fn with_otel(_config: &MetricsConfig, otel_config: &OtelConfig) -> Self {
180        let meter = global::meter(SERVICE_NAME_DEFAULT);
181        let bridge = OtelMetricsBridge::new(meter);
182        let _ = &otel_config.service_name; // Used for configuration, meter uses static name
183
184        Self {
185            requests_total: Counter::new(),
186            requests_success: Counter::new(),
187            requests_error: Counter::new(),
188            tokens_input: Counter::new(),
189            tokens_output: Counter::new(),
190            cache_read_tokens: Counter::new(),
191            cache_creation_tokens: Counter::new(),
192            tool_calls_total: Counter::new(),
193            tool_errors: Counter::new(),
194            active_sessions: Gauge::new(),
195            request_latency_ms: Histogram::default_latency(),
196            cost_total_micros: Counter::new(),
197            otel_bridge: Some(bridge),
198        }
199    }
200
201    pub fn record_request_start(&self) {
202        self.requests_total.inc();
203        self.active_sessions.inc();
204
205        #[cfg(feature = "otel")]
206        if let Some(ref bridge) = self.otel_bridge {
207            bridge.record_request_start();
208        }
209    }
210
211    pub fn record_request_end(&self, success: bool, latency_ms: f64) {
212        self.active_sessions.dec();
213        self.request_latency_ms.observe(latency_ms);
214        if success {
215            self.requests_success.inc();
216        } else {
217            self.requests_error.inc();
218        }
219
220        #[cfg(feature = "otel")]
221        if let Some(ref bridge) = self.otel_bridge {
222            bridge.record_request_end(success, latency_ms);
223        }
224    }
225
226    pub fn record_tokens(&self, input: u32, output: u32) {
227        self.tokens_input.add(input as u64);
228        self.tokens_output.add(output as u64);
229
230        #[cfg(feature = "otel")]
231        if let Some(ref bridge) = self.otel_bridge {
232            bridge.record_tokens(input as u64, output as u64);
233        }
234    }
235
236    pub fn record_cache(&self, read: u32, creation: u32) {
237        self.cache_read_tokens.add(read as u64);
238        self.cache_creation_tokens.add(creation as u64);
239
240        #[cfg(feature = "otel")]
241        if let Some(ref bridge) = self.otel_bridge {
242            bridge.record_cache(read as u64, creation as u64);
243        }
244    }
245
246    pub fn record_tool_call(&self, success: bool) {
247        self.tool_calls_total.inc();
248        if !success {
249            self.tool_errors.inc();
250        }
251
252        #[cfg(feature = "otel")]
253        if let Some(ref bridge) = self.otel_bridge {
254            bridge.record_tool_call(success);
255        }
256    }
257
258    pub fn record_cost(&self, cost_usd: f64) {
259        let micros = (cost_usd * 1_000_000.0) as u64;
260        self.cost_total_micros.add(micros);
261
262        #[cfg(feature = "otel")]
263        if let Some(ref bridge) = self.otel_bridge {
264            bridge.record_cost(cost_usd);
265        }
266    }
267
268    pub fn total_cost_usd(&self) -> f64 {
269        self.cost_total_micros.get() as f64 / 1_000_000.0
270    }
271}
272
273impl Default for MetricsRegistry {
274    fn default() -> Self {
275        Self::new(&MetricsConfig::default())
276    }
277}
278
279/// High-level metrics summary for an agent session.
280#[derive(Debug, Clone, Default)]
281pub struct AgentMetrics {
282    pub total_requests: u64,
283    pub successful_requests: u64,
284    pub failed_requests: u64,
285    pub total_input_tokens: u64,
286    pub total_output_tokens: u64,
287    pub cache_read_tokens: u64,
288    pub cache_creation_tokens: u64,
289    pub total_tool_calls: u64,
290    pub failed_tool_calls: u64,
291    pub total_cost_usd: f64,
292    pub avg_latency_ms: f64,
293}
294
295impl AgentMetrics {
296    pub fn from_registry(registry: &MetricsRegistry) -> Self {
297        let count = registry.request_latency_ms.count();
298        let avg_latency = if count > 0 {
299            registry.request_latency_ms.sum() as f64 / count as f64
300        } else {
301            0.0
302        };
303
304        Self {
305            total_requests: registry.requests_total.get(),
306            successful_requests: registry.requests_success.get(),
307            failed_requests: registry.requests_error.get(),
308            total_input_tokens: registry.tokens_input.get(),
309            total_output_tokens: registry.tokens_output.get(),
310            cache_read_tokens: registry.cache_read_tokens.get(),
311            cache_creation_tokens: registry.cache_creation_tokens.get(),
312            total_tool_calls: registry.tool_calls_total.get(),
313            failed_tool_calls: registry.tool_errors.get(),
314            total_cost_usd: registry.total_cost_usd(),
315            avg_latency_ms: avg_latency,
316        }
317    }
318}
319
#[cfg(test)]
mod tests {
    use super::*;

    /// `new` is enabled with a 60 s interval; `disabled` and `default` are off.
    #[test]
    fn test_metrics_config() {
        let on = MetricsConfig::new();
        assert!(on.enabled);
        assert_eq!(on.export_interval, Some(Duration::from_secs(60)));

        let off = MetricsConfig::disabled();
        assert!(!off.enabled);
        assert_eq!(off.export_interval, None);

        let dflt = MetricsConfig::default();
        assert!(!dflt.enabled);
        assert_eq!(dflt.export_interval, None);
    }

    /// A counter starts at zero and accumulates `inc`/`add` calls.
    #[test]
    fn test_counter() {
        let counter = Counter::new();
        assert_eq!(counter.get(), 0);
        counter.inc();
        assert_eq!(counter.get(), 1);
        counter.add(5);
        assert_eq!(counter.get(), 6);
    }

    /// A gauge can be set, then moved up and down by one.
    #[test]
    fn test_gauge() {
        let gauge = Gauge::new();
        assert_eq!(gauge.get(), 0);
        gauge.set(10);
        assert_eq!(gauge.get(), 10);
        gauge.inc();
        assert_eq!(gauge.get(), 11);
        gauge.dec();
        assert_eq!(gauge.get(), 10);
    }

    /// Observations are counted and summed, including one above every bound.
    #[test]
    fn test_histogram() {
        let hist = Histogram::new(vec![10.0, 50.0, 100.0]);
        assert_eq!(hist.count(), 0);
        assert_eq!(hist.sum(), 0);

        hist.observe(5.0);
        hist.observe(25.0);
        hist.observe(75.0);
        hist.observe(150.0);
        assert_eq!(hist.count(), 4);
        assert_eq!(hist.sum(), 255);
    }

    /// An untouched registry summarises to all zeros without dividing by zero.
    #[test]
    fn test_agent_metrics_empty_registry() {
        let metrics = AgentMetrics::from_registry(&MetricsRegistry::default());
        assert_eq!(metrics.total_requests, 0);
        assert_eq!(metrics.total_tool_calls, 0);
        assert_eq!(metrics.avg_latency_ms, 0.0);
        assert_eq!(metrics.total_cost_usd, 0.0);
    }

    /// One full request cycle is reflected in every summary field.
    #[test]
    fn test_metrics_registry() {
        let registry = MetricsRegistry::default();
        registry.record_request_start();
        assert_eq!(registry.active_sessions.get(), 1);

        registry.record_tokens(100, 50);
        registry.record_cache(20, 10);
        registry.record_tool_call(true);
        registry.record_tool_call(false);
        registry.record_cost(0.001);
        registry.record_request_end(true, 250.0);
        assert_eq!(registry.active_sessions.get(), 0);

        let metrics = AgentMetrics::from_registry(&registry);
        assert_eq!(metrics.total_requests, 1);
        assert_eq!(metrics.successful_requests, 1);
        assert_eq!(metrics.failed_requests, 0);
        assert_eq!(metrics.total_input_tokens, 100);
        assert_eq!(metrics.total_output_tokens, 50);
        assert_eq!(metrics.cache_read_tokens, 20);
        assert_eq!(metrics.cache_creation_tokens, 10);
        assert_eq!(metrics.total_tool_calls, 2);
        assert_eq!(metrics.failed_tool_calls, 1);
        assert!((metrics.total_cost_usd - 0.001).abs() < 1e-9);
        assert!((metrics.avg_latency_ms - 250.0).abs() < 1e-9);
    }

    /// A failed request is counted as an error, not a success.
    #[test]
    fn test_failed_request() {
        let registry = MetricsRegistry::default();
        registry.record_request_start();
        registry.record_request_end(false, 40.0);

        let metrics = AgentMetrics::from_registry(&registry);
        assert_eq!(metrics.total_requests, 1);
        assert_eq!(metrics.successful_requests, 0);
        assert_eq!(metrics.failed_requests, 1);
    }
}