Skip to main content

fraiseql_server/
metrics_server.rs

//! Prometheus metrics for observability.
//!
//! Tracks:
//! - GraphQL query execution time
//! - Query success/error rates
//! - Database query performance
//! - Connection pool statistics
//! - HTTP request/response metrics
//! - Cache hit/miss counts
//! - Federation entity-resolution, subgraph-request, and mutation metrics
9
10use std::{
11    sync::{
12        Arc,
13        atomic::{AtomicU64, Ordering},
14    },
15    time::Instant,
16};
17
/// Registry of atomic counters describing server activity.
///
/// Every field is an `Arc<AtomicU64>`, so a clone of the collector (it derives
/// `Clone`) is another handle onto the same counters rather than a copy of
/// their current values.
#[derive(Debug, Clone)]
pub struct MetricsCollector {
    /// Number of GraphQL queries executed.
    pub queries_total: Arc<AtomicU64>,

    /// Number of GraphQL queries that succeeded.
    pub queries_success: Arc<AtomicU64>,

    /// Number of GraphQL queries that failed.
    pub queries_error: Arc<AtomicU64>,

    /// Accumulated GraphQL query execution time, in microseconds.
    pub queries_duration_us: Arc<AtomicU64>,

    /// Number of database queries executed.
    pub db_queries_total: Arc<AtomicU64>,

    /// Accumulated database query time, in microseconds.
    pub db_queries_duration_us: Arc<AtomicU64>,

    /// Number of validation errors.
    pub validation_errors_total: Arc<AtomicU64>,

    /// Number of parse errors.
    pub parse_errors_total: Arc<AtomicU64>,

    /// Number of execution errors.
    pub execution_errors_total: Arc<AtomicU64>,

    /// Number of HTTP requests.
    pub http_requests_total: Arc<AtomicU64>,

    /// Number of HTTP 2xx responses.
    pub http_responses_2xx: Arc<AtomicU64>,

    /// Number of HTTP 4xx responses.
    pub http_responses_4xx: Arc<AtomicU64>,

    /// Number of HTTP 5xx responses.
    pub http_responses_5xx: Arc<AtomicU64>,

    /// Number of cache hits.
    pub cache_hits: Arc<AtomicU64>,

    /// Number of cache misses.
    pub cache_misses: Arc<AtomicU64>,

    // Federation Metrics
    /// Number of federation entity resolutions (successful or not).
    pub federation_entity_resolutions_total: Arc<AtomicU64>,

    /// Number of federation entity resolutions that failed.
    pub federation_entity_resolutions_errors: Arc<AtomicU64>,

    /// Accumulated federation entity resolution time, in microseconds.
    pub federation_entity_resolution_duration_us: Arc<AtomicU64>,

    /// Number of federation subgraph requests (successful or not).
    pub federation_subgraph_requests_total: Arc<AtomicU64>,

    /// Number of federation subgraph requests that failed.
    pub federation_subgraph_requests_errors: Arc<AtomicU64>,

    /// Accumulated federation subgraph request time, in microseconds.
    pub federation_subgraph_request_duration_us: Arc<AtomicU64>,

    /// Number of federation mutations (successful or not).
    pub federation_mutations_total: Arc<AtomicU64>,

    /// Number of federation mutations that failed.
    pub federation_mutations_errors: Arc<AtomicU64>,

    /// Accumulated federation mutation time, in microseconds.
    pub federation_mutation_duration_us: Arc<AtomicU64>,

    /// Number of federation entity cache hits.
    pub federation_entity_cache_hits: Arc<AtomicU64>,

    /// Number of federation entity cache misses.
    pub federation_entity_cache_misses: Arc<AtomicU64>,

    /// Number of failed federation operations, summed across entity
    /// resolutions, subgraph requests and mutations by the `record_*` methods.
    pub federation_errors_total: Arc<AtomicU64>,
}

impl MetricsCollector {
    /// Build a collector whose counters all start at zero.
    #[must_use]
    pub fn new() -> Self {
        // Every invocation allocates a fresh counter, so no two fields share
        // the same underlying atomic.
        let zeroed = || Arc::new(AtomicU64::new(0));

        Self {
            queries_total: zeroed(),
            queries_success: zeroed(),
            queries_error: zeroed(),
            queries_duration_us: zeroed(),
            db_queries_total: zeroed(),
            db_queries_duration_us: zeroed(),
            validation_errors_total: zeroed(),
            parse_errors_total: zeroed(),
            execution_errors_total: zeroed(),
            http_requests_total: zeroed(),
            http_responses_2xx: zeroed(),
            http_responses_4xx: zeroed(),
            http_responses_5xx: zeroed(),
            cache_hits: zeroed(),
            cache_misses: zeroed(),
            federation_entity_resolutions_total: zeroed(),
            federation_entity_resolutions_errors: zeroed(),
            federation_entity_resolution_duration_us: zeroed(),
            federation_subgraph_requests_total: zeroed(),
            federation_subgraph_requests_errors: zeroed(),
            federation_subgraph_request_duration_us: zeroed(),
            federation_mutations_total: zeroed(),
            federation_mutations_errors: zeroed(),
            federation_mutation_duration_us: zeroed(),
            federation_entity_cache_hits: zeroed(),
            federation_entity_cache_misses: zeroed(),
            federation_errors_total: zeroed(),
        }
    }
}

impl MetricsCollector {
    /// Record one finished entity resolution (all strategies).
    ///
    /// # Arguments
    ///
    /// * `duration_us` - How long the resolution took, in microseconds
    /// * `success` - `false` additionally bumps the entity-resolution error
    ///   counter and the overall federation error counter
    pub fn record_entity_resolution(&self, duration_us: u64, success: bool) {
        let relaxed = Ordering::Relaxed;
        self.federation_entity_resolutions_total.fetch_add(1, relaxed);
        self.federation_entity_resolution_duration_us.fetch_add(duration_us, relaxed);

        if success {
            return;
        }
        self.federation_entity_resolutions_errors.fetch_add(1, relaxed);
        self.federation_errors_total.fetch_add(1, relaxed);
    }

    /// Record one finished subgraph request.
    ///
    /// # Arguments
    ///
    /// * `duration_us` - How long the request took, in microseconds
    /// * `success` - Whether the request succeeded (HTTP 2xx); `false`
    ///   additionally bumps the subgraph error counter and the overall
    ///   federation error counter
    pub fn record_subgraph_request(&self, duration_us: u64, success: bool) {
        let relaxed = Ordering::Relaxed;
        self.federation_subgraph_requests_total.fetch_add(1, relaxed);
        self.federation_subgraph_request_duration_us.fetch_add(duration_us, relaxed);

        if success {
            return;
        }
        self.federation_subgraph_requests_errors.fetch_add(1, relaxed);
        self.federation_errors_total.fetch_add(1, relaxed);
    }

    /// Record one executed federation mutation.
    ///
    /// # Arguments
    ///
    /// * `duration_us` - How long the mutation took, in microseconds
    /// * `success` - `false` additionally bumps the mutation error counter and
    ///   the overall federation error counter
    pub fn record_mutation(&self, duration_us: u64, success: bool) {
        let relaxed = Ordering::Relaxed;
        self.federation_mutations_total.fetch_add(1, relaxed);
        self.federation_mutation_duration_us.fetch_add(duration_us, relaxed);

        if success {
            return;
        }
        self.federation_mutations_errors.fetch_add(1, relaxed);
        self.federation_errors_total.fetch_add(1, relaxed);
    }

    /// Count one federation entity cache hit.
    pub fn record_entity_cache_hit(&self) {
        let hits = &self.federation_entity_cache_hits;
        hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Count one federation entity cache miss.
    pub fn record_entity_cache_miss(&self) {
        let misses = &self.federation_entity_cache_misses;
        misses.fetch_add(1, Ordering::Relaxed);
    }
}

impl Default for MetricsCollector {
    /// Equivalent to [`MetricsCollector::new`]: all counters start at zero.
    fn default() -> Self {
        MetricsCollector::new()
    }
}
204
/// Guard for timing metrics.
///
/// Captures `Instant::now()` when constructed. Calling
/// [`TimingGuard::record`] adds the time elapsed since then to the shared
/// counter. No `Drop` impl is defined alongside this type, so a guard that is
/// dropped without `record` contributes nothing.
#[derive(Debug)]
pub struct TimingGuard {
    /// Moment the guard was created.
    start: Instant,
    /// Counter that receives the elapsed microseconds on `record`.
    duration_atomic: Arc<AtomicU64>,
}

impl TimingGuard {
    /// Create new timing guard; the clock starts immediately.
    ///
    /// # Arguments
    ///
    /// * `duration_atomic` - Counter that [`TimingGuard::record`] adds the
    ///   elapsed microseconds to
    #[must_use]
    pub fn new(duration_atomic: Arc<AtomicU64>) -> Self {
        Self {
            start: Instant::now(),
            duration_atomic,
        }
    }

    /// Record duration in microseconds and consume guard.
    ///
    /// The elapsed time is added to (not stored over) the counter's current
    /// value.
    pub fn record(self) {
        // `as_micros` yields a `u128`; saturate instead of silently
        // truncating with an `as` cast should it ever exceed `u64::MAX`.
        let duration_us = u64::try_from(self.start.elapsed().as_micros()).unwrap_or(u64::MAX);
        self.duration_atomic.fetch_add(duration_us, Ordering::Relaxed);
    }
}
226
/// Plain-value snapshot of the server metrics, ready to be rendered as
/// Prometheus text.
#[derive(Debug)]
pub struct PrometheusMetrics {
    /// Number of GraphQL queries executed.
    pub queries_total: u64,
    /// Number of GraphQL queries that succeeded.
    pub queries_success: u64,
    /// Number of GraphQL queries that failed.
    pub queries_error: u64,
    /// Mean GraphQL query duration, in milliseconds.
    pub queries_avg_duration_ms: f64,
    /// Number of database queries executed.
    pub db_queries_total: u64,
    /// Mean database query duration, in milliseconds.
    pub db_queries_avg_duration_ms: f64,
    /// Number of validation errors.
    pub validation_errors_total: u64,
    /// Number of parse errors.
    pub parse_errors_total: u64,
    /// Number of execution errors.
    pub execution_errors_total: u64,
    /// Number of HTTP requests processed.
    pub http_requests_total: u64,
    /// Number of HTTP 2xx responses.
    pub http_responses_2xx: u64,
    /// Number of HTTP 4xx responses.
    pub http_responses_4xx: u64,
    /// Number of HTTP 5xx responses.
    pub http_responses_5xx: u64,
    /// Number of cache hits.
    pub cache_hits: u64,
    /// Number of cache misses.
    pub cache_misses: u64,
    /// Share of cache lookups that were hits (0.0 to 1.0).
    pub cache_hit_ratio: f64,
}

impl PrometheusMetrics {
    /// Render the snapshot in Prometheus text format.
    ///
    /// Each metric is written as a `# HELP` line, a `# TYPE` line and a sample
    /// line, with a blank line between metrics. The cache hit ratio is printed
    /// with three decimal places; every other value uses its default
    /// `Display` form.
    #[must_use]
    pub fn to_prometheus_format(&self) -> String {
        // Exhaustive destructuring: adding a field to the struct without
        // deciding how to render it becomes a compile error here.
        let Self {
            queries_total,
            queries_success,
            queries_error,
            queries_avg_duration_ms,
            db_queries_total,
            db_queries_avg_duration_ms,
            validation_errors_total,
            parse_errors_total,
            execution_errors_total,
            http_requests_total,
            http_responses_2xx,
            http_responses_4xx,
            http_responses_5xx,
            cache_hits,
            cache_misses,
            cache_hit_ratio,
        } = self;

        format!(
            r"# HELP fraiseql_graphql_queries_total Total GraphQL queries executed
# TYPE fraiseql_graphql_queries_total counter
fraiseql_graphql_queries_total {}

# HELP fraiseql_graphql_queries_success Total successful GraphQL queries
# TYPE fraiseql_graphql_queries_success counter
fraiseql_graphql_queries_success {}

# HELP fraiseql_graphql_queries_error Total failed GraphQL queries
# TYPE fraiseql_graphql_queries_error counter
fraiseql_graphql_queries_error {}

# HELP fraiseql_graphql_query_duration_ms Average query execution time in milliseconds
# TYPE fraiseql_graphql_query_duration_ms gauge
fraiseql_graphql_query_duration_ms {}

# HELP fraiseql_database_queries_total Total database queries executed
# TYPE fraiseql_database_queries_total counter
fraiseql_database_queries_total {}

# HELP fraiseql_database_query_duration_ms Average database query time in milliseconds
# TYPE fraiseql_database_query_duration_ms gauge
fraiseql_database_query_duration_ms {}

# HELP fraiseql_validation_errors_total Total validation errors
# TYPE fraiseql_validation_errors_total counter
fraiseql_validation_errors_total {}

# HELP fraiseql_parse_errors_total Total parse errors
# TYPE fraiseql_parse_errors_total counter
fraiseql_parse_errors_total {}

# HELP fraiseql_execution_errors_total Total execution errors
# TYPE fraiseql_execution_errors_total counter
fraiseql_execution_errors_total {}

# HELP fraiseql_http_requests_total Total HTTP requests
# TYPE fraiseql_http_requests_total counter
fraiseql_http_requests_total {}

# HELP fraiseql_http_responses_2xx Total 2xx HTTP responses
# TYPE fraiseql_http_responses_2xx counter
fraiseql_http_responses_2xx {}

# HELP fraiseql_http_responses_4xx Total 4xx HTTP responses
# TYPE fraiseql_http_responses_4xx counter
fraiseql_http_responses_4xx {}

# HELP fraiseql_http_responses_5xx Total 5xx HTTP responses
# TYPE fraiseql_http_responses_5xx counter
fraiseql_http_responses_5xx {}

# HELP fraiseql_cache_hits Total cache hits
# TYPE fraiseql_cache_hits counter
fraiseql_cache_hits {}

# HELP fraiseql_cache_misses Total cache misses
# TYPE fraiseql_cache_misses counter
fraiseql_cache_misses {}

# HELP fraiseql_cache_hit_ratio Cache hit ratio (0-1)
# TYPE fraiseql_cache_hit_ratio gauge
fraiseql_cache_hit_ratio {:.3}
",
            queries_total,
            queries_success,
            queries_error,
            queries_avg_duration_ms,
            db_queries_total,
            db_queries_avg_duration_ms,
            validation_errors_total,
            parse_errors_total,
            execution_errors_total,
            http_requests_total,
            http_responses_2xx,
            http_responses_4xx,
            http_responses_5xx,
            cache_hits,
            cache_misses,
            cache_hit_ratio,
        )
    }
}
352
353impl From<&MetricsCollector> for PrometheusMetrics {
354    fn from(collector: &MetricsCollector) -> Self {
355        let queries_total = collector.queries_total.load(Ordering::Relaxed);
356        let queries_success = collector.queries_success.load(Ordering::Relaxed);
357        let queries_error = collector.queries_error.load(Ordering::Relaxed);
358        let queries_duration_us = collector.queries_duration_us.load(Ordering::Relaxed);
359
360        let db_queries_total = collector.db_queries_total.load(Ordering::Relaxed);
361        let db_queries_duration_us = collector.db_queries_duration_us.load(Ordering::Relaxed);
362
363        let cache_hits = collector.cache_hits.load(Ordering::Relaxed);
364        let cache_misses = collector.cache_misses.load(Ordering::Relaxed);
365        let cache_total = cache_hits + cache_misses;
366
367        Self {
368            queries_total,
369            queries_success,
370            queries_error,
371            queries_avg_duration_ms: if queries_total > 0 {
372                (queries_duration_us as f64 / queries_total as f64) / 1000.0
373            } else {
374                0.0
375            },
376            db_queries_total,
377            db_queries_avg_duration_ms: if db_queries_total > 0 {
378                (db_queries_duration_us as f64 / db_queries_total as f64) / 1000.0
379            } else {
380                0.0
381            },
382            validation_errors_total: collector.validation_errors_total.load(Ordering::Relaxed),
383            parse_errors_total: collector.parse_errors_total.load(Ordering::Relaxed),
384            execution_errors_total: collector.execution_errors_total.load(Ordering::Relaxed),
385            http_requests_total: collector.http_requests_total.load(Ordering::Relaxed),
386            http_responses_2xx: collector.http_responses_2xx.load(Ordering::Relaxed),
387            http_responses_4xx: collector.http_responses_4xx.load(Ordering::Relaxed),
388            http_responses_5xx: collector.http_responses_5xx.load(Ordering::Relaxed),
389            cache_hits,
390            cache_misses,
391            cache_hit_ratio: if cache_total > 0 {
392                cache_hits as f64 / cache_total as f64
393            } else {
394                0.0
395            },
396        }
397    }
398}
399
#[cfg(test)]
mod tests {
    use super::*;

    /// Relaxed read of one counter, to keep the assertions short.
    fn value_of(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    /// A freshly built collector reports zero for its query counters.
    #[test]
    fn test_metrics_collector_creation() {
        let fresh = MetricsCollector::new();

        assert_eq!(value_of(&fresh.queries_total), 0);
        assert_eq!(value_of(&fresh.queries_success), 0);
    }

    /// `fetch_add` on the public counters is visible through later loads.
    #[test]
    fn test_metrics_increment() {
        let metrics = MetricsCollector::new();

        metrics.queries_total.fetch_add(5, Ordering::Relaxed);
        metrics.queries_success.fetch_add(4, Ordering::Relaxed);
        metrics.queries_error.fetch_add(1, Ordering::Relaxed);

        assert_eq!(value_of(&metrics.queries_total), 5);
        assert_eq!(value_of(&metrics.queries_success), 4);
        assert_eq!(value_of(&metrics.queries_error), 1);
    }

    /// The rendered text carries the counter samples plus HELP/TYPE metadata.
    #[test]
    fn test_prometheus_output_format() {
        let metrics = MetricsCollector::new();
        metrics.queries_total.store(100, Ordering::Relaxed);
        metrics.queries_success.store(95, Ordering::Relaxed);
        metrics.queries_error.store(5, Ordering::Relaxed);

        let rendered = PrometheusMetrics::from(&metrics).to_prometheus_format();

        assert!(rendered.contains("fraiseql_graphql_queries_total 100"));
        assert!(rendered.contains("fraiseql_graphql_queries_success 95"));
        assert!(rendered.contains("fraiseql_graphql_queries_error 5"));
        assert!(rendered.contains("# HELP"));
        assert!(rendered.contains("# TYPE"));
    }

    /// A guard recorded after a short sleep adds a plausible microsecond count.
    #[test]
    fn test_timing_guard() {
        let elapsed_us = Arc::new(AtomicU64::new(0));
        let guard = TimingGuard::new(Arc::clone(&elapsed_us));

        // Sleep briefly so the measured duration has a known lower bound.
        std::thread::sleep(std::time::Duration::from_micros(100));
        guard.record();

        let measured = value_of(&elapsed_us);
        assert!(measured >= 100);
        assert!(measured < 1_000_000); // Well under one second
    }

    /// 75 hits out of 100 lookups yields a ratio of 0.75.
    #[test]
    fn test_cache_hit_ratio_calculation() {
        let metrics = MetricsCollector::new();
        metrics.cache_hits.store(75, Ordering::Relaxed);
        metrics.cache_misses.store(25, Ordering::Relaxed);

        let snapshot = PrometheusMetrics::from(&metrics);

        assert!((snapshot.cache_hit_ratio - 0.75).abs() < 0.001);
    }

    /// 50 ms spread over 10 queries averages to 5 ms.
    #[test]
    fn test_average_duration_calculation() {
        let metrics = MetricsCollector::new();
        metrics.queries_total.store(10, Ordering::Relaxed);
        metrics.queries_duration_us.store(50_000, Ordering::Relaxed); // 50 ms in total

        let snapshot = PrometheusMetrics::from(&metrics);

        assert!((snapshot.queries_avg_duration_ms - 5.0).abs() < 0.01); // 5 ms per query
    }
}