// mcpkit_server/metrics.rs

//! Server-level metrics for MCP servers.
//!
//! This module provides request-level metrics tracking for MCP servers,
//! complementing the transport-level telemetry in `mcpkit_transport::telemetry`.
//!
//! # Example
//!
//! ```rust
//! use mcpkit_server::metrics::ServerMetrics;
//!
//! let metrics = ServerMetrics::new();
//!
//! // Record a request
//! metrics.record_request("tools/call", std::time::Duration::from_millis(50), true);
//!
//! // Get statistics
//! let stats = metrics.snapshot();
//! println!("Total requests: {}", stats.total_requests);
//! println!("Error rate: {:.2}%", stats.error_rate() * 100.0);
//! ```

use std::collections::HashMap;
use std::sync::RwLock;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Duration;

/// Server metrics collector.
///
/// Tracks request counts, latencies, and errors at the MCP method level.
/// All operations are thread-safe and lock-free where possible.
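///
/// Global totals are plain atomics; the per-method maps store `AtomicU64`
/// values behind an `RwLock`, so once a method's entry exists, recording it
/// only needs a read lock plus an atomic add.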
#[derive(Debug, Default)]
pub struct ServerMetrics {
    /// Total requests received.
    total_requests: AtomicU64,
    /// Total successful requests.
    successful_requests: AtomicU64,
    /// Total failed requests.
    failed_requests: AtomicU64,
    /// Total latency in microseconds (for average calculation).
    total_latency_us: AtomicU64,
    /// Per-method request counts.
    method_counts: RwLock<HashMap<String, AtomicU64>>,
    /// Per-method error counts.
    method_errors: RwLock<HashMap<String, AtomicU64>>,
    /// Per-method total latency in microseconds.
    method_latency_us: RwLock<HashMap<String, AtomicU64>>,
}

impl ServerMetrics {
    /// Create a new metrics collector.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Record a request.
    ///
    /// # Arguments
    ///
    /// * `method` - The MCP method name (e.g., "tools/call", "resources/read")
    /// * `duration` - How long the request took
    /// * `success` - Whether the request succeeded
    pub fn record_request(&self, method: &str, duration: Duration, success: bool) {
        let latency_us = duration.as_micros() as u64;

        // Update global counters
        self.total_requests.fetch_add(1, Ordering::Relaxed);
        self.total_latency_us
            .fetch_add(latency_us, Ordering::Relaxed);

        if success {
            self.successful_requests.fetch_add(1, Ordering::Relaxed);
        } else {
            self.failed_requests.fetch_add(1, Ordering::Relaxed);
        }

        // Update per-method counters
        self.increment_method_counter(&self.method_counts, method);
        self.add_method_latency(method, latency_us);

        if !success {
            self.increment_method_counter(&self.method_errors, method);
        }
    }

    /// Record a successful request (convenience method).
    pub fn record_success(&self, method: &str, duration: Duration) {
        self.record_request(method, duration, true);
    }

    /// Record a failed request (convenience method).
    pub fn record_failure(&self, method: &str, duration: Duration) {
        self.record_request(method, duration, false);
    }

    /// Get a snapshot of current metrics.
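    ///
    /// # Example
    ///
    /// A minimal usage sketch (the method name here is illustrative):
    ///
    /// ```rust
    /// use mcpkit_server::metrics::ServerMetrics;
    /// use std::time::Duration;
    ///
    /// let metrics = ServerMetrics::new();
    /// metrics.record_success("tools/call", Duration::from_millis(5));
    /// metrics.record_failure("tools/call", Duration::from_millis(5));
    ///
    /// let stats = metrics.snapshot();
    /// assert_eq!(stats.total_requests, 2);
    /// assert_eq!(stats.failed_requests, 1);
    /// ```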
    #[must_use]
    pub fn snapshot(&self) -> MetricsSnapshot {
        let method_counts = self
            .method_counts
            .read()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let method_errors = self
            .method_errors
            .read()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        let method_latency = self
            .method_latency_us
            .read()
            .unwrap_or_else(std::sync::PoisonError::into_inner);

        let per_method: HashMap<String, MethodStats> = method_counts
            .iter()
            .map(|(method, count)| {
                let requests = count.load(Ordering::Relaxed);
                let errors = method_errors
                    .get(method)
                    .map_or(0, |c| c.load(Ordering::Relaxed));
                let latency_us = method_latency
                    .get(method)
                    .map_or(0, |c| c.load(Ordering::Relaxed));

                (
                    method.clone(),
                    MethodStats {
                        requests,
                        errors,
                        avg_latency_ms: if requests > 0 {
                            (latency_us as f64 / requests as f64) / 1000.0
                        } else {
                            0.0
                        },
                    },
                )
            })
            .collect();

        let total = self.total_requests.load(Ordering::Relaxed);
        let total_latency = self.total_latency_us.load(Ordering::Relaxed);

        MetricsSnapshot {
            total_requests: total,
            successful_requests: self.successful_requests.load(Ordering::Relaxed),
            failed_requests: self.failed_requests.load(Ordering::Relaxed),
            avg_latency_ms: if total > 0 {
                (total_latency as f64 / total as f64) / 1000.0
            } else {
                0.0
            },
            per_method,
        }
    }

    /// Reset all metrics to zero.
    pub fn reset(&self) {
        self.total_requests.store(0, Ordering::Relaxed);
        self.successful_requests.store(0, Ordering::Relaxed);
        self.failed_requests.store(0, Ordering::Relaxed);
        self.total_latency_us.store(0, Ordering::Relaxed);

        if let Ok(mut counts) = self.method_counts.write() {
            counts.clear();
        }
        if let Ok(mut errors) = self.method_errors.write() {
            errors.clear();
        }
        if let Ok(mut latency) = self.method_latency_us.write() {
            latency.clear();
        }
    }

    fn increment_method_counter(&self, map: &RwLock<HashMap<String, AtomicU64>>, method: &str) {
        // Try to increment existing counter
        if let Ok(counts) = map.read() {
            if let Some(counter) = counts.get(method) {
                counter.fetch_add(1, Ordering::Relaxed);
                return;
            }
        }

        // Counter doesn't exist, need to create it
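        // Another thread may have created the entry between dropping the read
        // lock and acquiring the write lock; `entry` makes that race benign.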
        if let Ok(mut counts) = map.write() {
            counts
                .entry(method.to_string())
                .or_insert_with(|| AtomicU64::new(0))
                .fetch_add(1, Ordering::Relaxed);
        }
    }

    fn add_method_latency(&self, method: &str, latency_us: u64) {
        // Try to add to existing counter
        if let Ok(latencies) = self.method_latency_us.read() {
            if let Some(counter) = latencies.get(method) {
                counter.fetch_add(latency_us, Ordering::Relaxed);
                return;
            }
        }

        // Counter doesn't exist, need to create it
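        // Same optimistic read-then-write pattern as `increment_method_counter`.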
        if let Ok(mut latencies) = self.method_latency_us.write() {
            latencies
                .entry(method.to_string())
                .or_insert_with(|| AtomicU64::new(0))
                .fetch_add(latency_us, Ordering::Relaxed);
        }
    }
}

/// A point-in-time snapshot of server metrics.
#[derive(Debug, Clone)]
pub struct MetricsSnapshot {
    /// Total requests received.
    pub total_requests: u64,
    /// Total successful requests.
    pub successful_requests: u64,
    /// Total failed requests.
    pub failed_requests: u64,
    /// Average request latency in milliseconds.
    pub avg_latency_ms: f64,
    /// Per-method statistics.
    pub per_method: HashMap<String, MethodStats>,
}

impl MetricsSnapshot {
    /// Calculate the error rate (0.0 to 1.0).
    #[must_use]
    pub fn error_rate(&self) -> f64 {
        if self.total_requests == 0 {
            0.0
        } else {
            self.failed_requests as f64 / self.total_requests as f64
        }
    }

    /// Calculate the success rate (0.0 to 1.0).
    #[must_use]
    pub fn success_rate(&self) -> f64 {
        1.0 - self.error_rate()
    }

    /// Get statistics for a specific method.
    #[must_use]
    pub fn method(&self, name: &str) -> Option<&MethodStats> {
        self.per_method.get(name)
    }

    /// Get the most called methods, sorted by request count.
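    ///
    /// A sketch of printing a simple leaderboard (method names are illustrative):
    ///
    /// ```rust
    /// # use mcpkit_server::metrics::ServerMetrics;
    /// # use std::time::Duration;
    /// # let metrics = ServerMetrics::new();
    /// # metrics.record_success("tools/call", Duration::from_millis(10));
    /// let snapshot = metrics.snapshot();
    /// for (method, stats) in snapshot.top_methods(5) {
    ///     println!("{method}: {} requests, {:.1} ms avg", stats.requests, stats.avg_latency_ms);
    /// }
    /// ```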
    #[must_use]
    pub fn top_methods(&self, limit: usize) -> Vec<(&String, &MethodStats)> {
        let mut methods: Vec<_> = self.per_method.iter().collect();
        methods.sort_by(|a, b| b.1.requests.cmp(&a.1.requests));
        methods.into_iter().take(limit).collect()
    }

    /// Get methods with highest error rates.
    #[must_use]
    pub fn most_errors(&self, limit: usize) -> Vec<(&String, &MethodStats)> {
        let mut methods: Vec<_> = self
            .per_method
            .iter()
            .filter(|(_, s)| s.errors > 0)
            .collect();
        methods.sort_by(|a, b| b.1.error_rate().total_cmp(&a.1.error_rate()));
        methods.into_iter().take(limit).collect()
    }

    /// Get methods with highest average latency.
    #[must_use]
    pub fn slowest_methods(&self, limit: usize) -> Vec<(&String, &MethodStats)> {
        let mut methods: Vec<_> = self.per_method.iter().collect();
        methods.sort_by(|a, b| b.1.avg_latency_ms.total_cmp(&a.1.avg_latency_ms));
        methods.into_iter().take(limit).collect()
    }
}

/// Statistics for a single MCP method.
#[derive(Debug, Clone)]
pub struct MethodStats {
    /// Total requests for this method.
    pub requests: u64,
    /// Total errors for this method.
    pub errors: u64,
    /// Average latency in milliseconds.
    pub avg_latency_ms: f64,
}

impl MethodStats {
    /// Calculate the error rate for this method.
    #[must_use]
    pub fn error_rate(&self) -> f64 {
        if self.requests == 0 {
            0.0
        } else {
            self.errors as f64 / self.requests as f64
        }
    }

    /// Calculate the success rate for this method.
    #[must_use]
    pub fn success_rate(&self) -> f64 {
        1.0 - self.error_rate()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_metrics() {
        let metrics = ServerMetrics::new();

        metrics.record_success("tools/call", Duration::from_millis(50));
        metrics.record_success("tools/call", Duration::from_millis(100));
        metrics.record_failure("tools/call", Duration::from_millis(25));
        metrics.record_success("resources/read", Duration::from_millis(10));

        let snapshot = metrics.snapshot();

        assert_eq!(snapshot.total_requests, 4);
        assert_eq!(snapshot.successful_requests, 3);
        assert_eq!(snapshot.failed_requests, 1);
        assert!((snapshot.error_rate() - 0.25).abs() < 0.01);
    }

    #[test]
    fn test_per_method_stats() {
        let metrics = ServerMetrics::new();

        metrics.record_success("tools/call", Duration::from_millis(100));
        metrics.record_success("tools/call", Duration::from_millis(100));
        metrics.record_failure("tools/call", Duration::from_millis(100));

        let snapshot = metrics.snapshot();
        let tools_stats = snapshot.method("tools/call").unwrap();

        assert_eq!(tools_stats.requests, 3);
        assert_eq!(tools_stats.errors, 1);
        assert!((tools_stats.avg_latency_ms - 100.0).abs() < 1.0);
    }

    #[test]
    fn test_reset() {
        let metrics = ServerMetrics::new();

        metrics.record_success("test", Duration::from_millis(50));
        assert_eq!(metrics.snapshot().total_requests, 1);

        metrics.reset();
        assert_eq!(metrics.snapshot().total_requests, 0);
    }

    #[test]
    fn test_top_methods() {
        let metrics = ServerMetrics::new();

        for _ in 0..10 {
            metrics.record_success("tools/call", Duration::from_millis(10));
        }
        for _ in 0..5 {
            metrics.record_success("resources/read", Duration::from_millis(10));
        }
        for _ in 0..3 {
            metrics.record_success("prompts/get", Duration::from_millis(10));
        }

        let snapshot = metrics.snapshot();
        let top = snapshot.top_methods(2);

        assert_eq!(top.len(), 2);
        assert_eq!(top[0].0, "tools/call");
        assert_eq!(top[1].0, "resources/read");
    }
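
    // Additional sketch covering the ranking helpers; the method names below
    // are illustrative, not real MCP methods.
    #[test]
    fn test_error_and_latency_rankings() {
        let metrics = ServerMetrics::new();

        metrics.record_success("fast/ok", Duration::from_millis(1));
        metrics.record_success("slow/flaky", Duration::from_millis(200));
        metrics.record_failure("slow/flaky", Duration::from_millis(200));

        let snapshot = metrics.snapshot();

        // "slow/flaky" has the only errors and the highest average latency.
        assert_eq!(snapshot.most_errors(5)[0].0, "slow/flaky");
        assert_eq!(snapshot.slowest_methods(1)[0].0, "slow/flaky");
        assert!((snapshot.success_rate() - 2.0 / 3.0).abs() < 1e-9);
    }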
}