mockforge_observability/prometheus/
metrics.rs

1//! Prometheus metrics definitions and registry
2
3use once_cell::sync::Lazy;
4use prometheus::{
5    Gauge, GaugeVec, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
6    Opts, Registry,
7};
8use std::sync::Arc;
9use tracing::debug;
10
/// Global metrics registry for MockForge
///
/// Bundles the Prometheus [`Registry`] together with one public handle per
/// metric that [`MetricsRegistry::new`] creates and registers. The label names
/// listed on each field are the ones the metric is constructed with in `new`.
#[derive(Clone)]
pub struct MetricsRegistry {
    /// Underlying Prometheus registry, held behind an `Arc`.
    registry: Arc<Registry>,

    // Request metrics by protocol
    /// Total number of requests; labels: `protocol`, `method`, `status`.
    pub requests_total: IntCounterVec,
    /// Request duration in seconds; labels: `protocol`, `method`.
    pub requests_duration_seconds: HistogramVec,
    /// Number of requests currently being processed; label: `protocol`.
    pub requests_in_flight: IntGaugeVec,

    // Request metrics by path (endpoint-specific)
    /// Total number of requests; labels: `path`, `method`, `status`.
    pub requests_by_path_total: IntCounterVec,
    /// Request duration in seconds; labels: `path`, `method`.
    pub request_duration_by_path_seconds: HistogramVec,
    /// Average request latency in seconds; labels: `path`, `method`.
    pub average_latency_by_path_seconds: GaugeVec,

    // Workspace-specific metrics
    /// Total number of requests; labels: `workspace_id`, `method`, `status`.
    pub workspace_requests_total: IntCounterVec,
    /// Request duration in seconds; labels: `workspace_id`, `method`.
    pub workspace_requests_duration_seconds: HistogramVec,
    /// Number of active routes in each workspace; label: `workspace_id`.
    pub workspace_active_routes: IntGaugeVec,
    /// Total number of errors; labels: `workspace_id`, `error_type`.
    pub workspace_errors_total: IntCounterVec,

    // Error metrics
    /// Total number of errors; labels: `protocol`, `error_type`.
    pub errors_total: IntCounterVec,
    /// Error rate (0.0 to 1.0); label: `protocol`.
    pub error_rate: GaugeVec,

    // Plugin metrics
    /// Total number of plugin executions; labels: `plugin_name`, `status`.
    pub plugin_executions_total: IntCounterVec,
    /// Plugin execution duration in seconds; label: `plugin_name`.
    pub plugin_execution_duration_seconds: HistogramVec,
    /// Total number of plugin errors; labels: `plugin_name`, `error_type`.
    pub plugin_errors_total: IntCounterVec,

    // WebSocket specific metrics
    /// Number of active WebSocket connections.
    pub ws_connections_active: IntGauge,
    /// Total number of WebSocket connections established.
    pub ws_connections_total: IntCounter,
    /// WebSocket connection duration in seconds; label: `status`.
    pub ws_connection_duration_seconds: HistogramVec,
    /// Total number of WebSocket messages sent.
    pub ws_messages_sent: IntCounter,
    /// Total number of WebSocket messages received.
    pub ws_messages_received: IntCounter,
    /// Total number of WebSocket errors.
    pub ws_errors_total: IntCounter,

    // SMTP specific metrics
    /// Number of active SMTP connections.
    pub smtp_connections_active: IntGauge,
    /// Total number of SMTP connections.
    pub smtp_connections_total: IntCounter,
    /// Total number of SMTP messages received.
    pub smtp_messages_received_total: IntCounter,
    /// Total number of SMTP messages stored in mailbox.
    pub smtp_messages_stored_total: IntCounter,
    /// Total number of SMTP errors; label: `error_type`.
    pub smtp_errors_total: IntCounterVec,

    // MQTT specific metrics
    /// Number of active MQTT client connections.
    pub mqtt_connections_active: IntGauge,
    /// Total number of MQTT client connections established.
    pub mqtt_connections_total: IntCounter,
    /// Total number of MQTT messages published.
    pub mqtt_messages_published_total: IntCounter,
    /// Total number of MQTT messages received.
    pub mqtt_messages_received_total: IntCounter,
    /// Number of active MQTT topics.
    pub mqtt_topics_active: IntGauge,
    /// Number of active MQTT subscriptions.
    pub mqtt_subscriptions_active: IntGauge,
    /// Number of retained MQTT messages.
    pub mqtt_retained_messages: IntGauge,
    /// Total number of MQTT errors; label: `error_type`.
    pub mqtt_errors_total: IntCounterVec,

    // System metrics
    /// Memory usage in bytes.
    pub memory_usage_bytes: Gauge,
    /// CPU usage percentage.
    pub cpu_usage_percent: Gauge,
    /// Number of active threads.
    pub thread_count: Gauge,
    /// Server uptime in seconds.
    pub uptime_seconds: Gauge,

    // Scenario metrics (for Phase 4)
    /// Active scenario mode (0=healthy, 1=degraded, 2=error, 3=chaos).
    pub active_scenario_mode: IntGauge,
    /// Total number of chaos mode triggers.
    pub chaos_triggers_total: IntCounter,

    // Business/SLO metrics
    /// Service availability (0.0 to 1.0); label: `protocol`.
    pub service_availability: GaugeVec,
    /// SLO compliance (0.0 to 1.0); labels: `protocol`, `slo_type`.
    pub slo_compliance: GaugeVec,
    /// Successful request rate (0.0 to 1.0); label: `protocol`.
    pub successful_request_rate: GaugeVec,
    /// P95 latency SLO compliance (1.0 = compliant, 0.0 = non-compliant); label: `protocol`.
    pub p95_latency_slo_compliance: GaugeVec,
    /// Remaining error budget (0.0 to 1.0); label: `protocol`.
    pub error_budget_remaining: GaugeVec,
}
83
84impl MetricsRegistry {
85    /// Create a new metrics registry with all metrics initialized
86    pub fn new() -> Self {
87        let registry = Registry::new();
88
89        // Request metrics
90        let requests_total = IntCounterVec::new(
91            Opts::new(
92                "mockforge_requests_total",
93                "Total number of requests by protocol, method, and status",
94            ),
95            &["protocol", "method", "status"],
96        )
97        .expect("Failed to create requests_total metric");
98
99        let requests_duration_seconds = HistogramVec::new(
100            HistogramOpts::new("mockforge_request_duration_seconds", "Request duration in seconds")
101                .buckets(vec![
102                    0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
103                ]),
104            &["protocol", "method"],
105        )
106        .expect("Failed to create requests_duration_seconds metric");
107
108        let requests_in_flight = IntGaugeVec::new(
109            Opts::new(
110                "mockforge_requests_in_flight",
111                "Number of requests currently being processed",
112            ),
113            &["protocol"],
114        )
115        .expect("Failed to create requests_in_flight metric");
116
117        // Error metrics
118        let errors_total = IntCounterVec::new(
119            Opts::new(
120                "mockforge_errors_total",
121                "Total number of errors by protocol and error type",
122            ),
123            &["protocol", "error_type"],
124        )
125        .expect("Failed to create errors_total metric");
126
127        let error_rate = GaugeVec::new(
128            Opts::new("mockforge_error_rate", "Error rate by protocol (0.0 to 1.0)"),
129            &["protocol"],
130        )
131        .expect("Failed to create error_rate metric");
132
133        // Plugin metrics
134        let plugin_executions_total = IntCounterVec::new(
135            Opts::new("mockforge_plugin_executions_total", "Total number of plugin executions"),
136            &["plugin_name", "status"],
137        )
138        .expect("Failed to create plugin_executions_total metric");
139
140        let plugin_execution_duration_seconds = HistogramVec::new(
141            HistogramOpts::new(
142                "mockforge_plugin_execution_duration_seconds",
143                "Plugin execution duration in seconds",
144            )
145            .buckets(vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]),
146            &["plugin_name"],
147        )
148        .expect("Failed to create plugin_execution_duration_seconds metric");
149
150        let plugin_errors_total = IntCounterVec::new(
151            Opts::new("mockforge_plugin_errors_total", "Total number of plugin errors"),
152            &["plugin_name", "error_type"],
153        )
154        .expect("Failed to create plugin_errors_total metric");
155
156        // WebSocket metrics
157        // Path-based request metrics
158        let requests_by_path_total = IntCounterVec::new(
159            Opts::new(
160                "mockforge_requests_by_path_total",
161                "Total number of requests by path, method, and status",
162            ),
163            &["path", "method", "status"],
164        )
165        .expect("Failed to create requests_by_path_total metric");
166
167        let request_duration_by_path_seconds = HistogramVec::new(
168            HistogramOpts::new(
169                "mockforge_request_duration_by_path_seconds",
170                "Request duration by path in seconds",
171            )
172            .buckets(vec![
173                0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
174            ]),
175            &["path", "method"],
176        )
177        .expect("Failed to create request_duration_by_path_seconds metric");
178
179        let average_latency_by_path_seconds = GaugeVec::new(
180            Opts::new(
181                "mockforge_average_latency_by_path_seconds",
182                "Average request latency by path in seconds",
183            ),
184            &["path", "method"],
185        )
186        .expect("Failed to create average_latency_by_path_seconds metric");
187
188        // Workspace-specific metrics
189        let workspace_requests_total = IntCounterVec::new(
190            Opts::new(
191                "mockforge_workspace_requests_total",
192                "Total number of requests by workspace, method, and status",
193            ),
194            &["workspace_id", "method", "status"],
195        )
196        .expect("Failed to create workspace_requests_total metric");
197
198        let workspace_requests_duration_seconds = HistogramVec::new(
199            HistogramOpts::new(
200                "mockforge_workspace_request_duration_seconds",
201                "Request duration by workspace in seconds",
202            )
203            .buckets(vec![
204                0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
205            ]),
206            &["workspace_id", "method"],
207        )
208        .expect("Failed to create workspace_requests_duration_seconds metric");
209
210        let workspace_active_routes = IntGaugeVec::new(
211            Opts::new(
212                "mockforge_workspace_active_routes",
213                "Number of active routes in each workspace",
214            ),
215            &["workspace_id"],
216        )
217        .expect("Failed to create workspace_active_routes metric");
218
219        let workspace_errors_total = IntCounterVec::new(
220            Opts::new("mockforge_workspace_errors_total", "Total number of errors by workspace"),
221            &["workspace_id", "error_type"],
222        )
223        .expect("Failed to create workspace_errors_total metric");
224
225        // WebSocket metrics
226        let ws_connections_active = IntGauge::new(
227            "mockforge_ws_connections_active",
228            "Number of active WebSocket connections",
229        )
230        .expect("Failed to create ws_connections_active metric");
231
232        let ws_connections_total = IntCounter::new(
233            "mockforge_ws_connections_total",
234            "Total number of WebSocket connections established",
235        )
236        .expect("Failed to create ws_connections_total metric");
237
238        let ws_connection_duration_seconds = HistogramVec::new(
239            HistogramOpts::new(
240                "mockforge_ws_connection_duration_seconds",
241                "WebSocket connection duration in seconds",
242            )
243            .buckets(vec![1.0, 5.0, 10.0, 30.0, 60.0, 300.0, 600.0, 1800.0, 3600.0]),
244            &["status"],
245        )
246        .expect("Failed to create ws_connection_duration_seconds metric");
247
248        let ws_messages_sent = IntCounter::new(
249            "mockforge_ws_messages_sent_total",
250            "Total number of WebSocket messages sent",
251        )
252        .expect("Failed to create ws_messages_sent metric");
253
254        let ws_messages_received = IntCounter::new(
255            "mockforge_ws_messages_received_total",
256            "Total number of WebSocket messages received",
257        )
258        .expect("Failed to create ws_messages_received metric");
259
260        let ws_errors_total =
261            IntCounter::new("mockforge_ws_errors_total", "Total number of WebSocket errors")
262                .expect("Failed to create ws_errors_total metric");
263
264        // SMTP metrics
265        let smtp_connections_active =
266            IntGauge::new("mockforge_smtp_connections_active", "Number of active SMTP connections")
267                .expect("Failed to create smtp_connections_active metric");
268
269        let smtp_connections_total =
270            IntCounter::new("mockforge_smtp_connections_total", "Total number of SMTP connections")
271                .expect("Failed to create smtp_connections_total metric");
272
273        let smtp_messages_received_total = IntCounter::new(
274            "mockforge_smtp_messages_received_total",
275            "Total number of SMTP messages received",
276        )
277        .expect("Failed to create smtp_messages_received_total metric");
278
279        let smtp_messages_stored_total = IntCounter::new(
280            "mockforge_smtp_messages_stored_total",
281            "Total number of SMTP messages stored in mailbox",
282        )
283        .expect("Failed to create smtp_messages_stored_total metric");
284
285        let smtp_errors_total = IntCounterVec::new(
286            Opts::new("mockforge_smtp_errors_total", "Total number of SMTP errors by type"),
287            &["error_type"],
288        )
289        .expect("Failed to create smtp_errors_total metric");
290
291        // MQTT metrics
292        let mqtt_connections_active = IntGauge::new(
293            "mockforge_mqtt_connections_active",
294            "Number of active MQTT client connections",
295        )
296        .expect("Failed to create mqtt_connections_active metric");
297
298        let mqtt_connections_total = IntCounter::new(
299            "mockforge_mqtt_connections_total",
300            "Total number of MQTT client connections established",
301        )
302        .expect("Failed to create mqtt_connections_total metric");
303
304        let mqtt_messages_published_total = IntCounter::new(
305            "mockforge_mqtt_messages_published_total",
306            "Total number of MQTT messages published",
307        )
308        .expect("Failed to create mqtt_messages_published_total metric");
309
310        let mqtt_messages_received_total = IntCounter::new(
311            "mockforge_mqtt_messages_received_total",
312            "Total number of MQTT messages received",
313        )
314        .expect("Failed to create mqtt_messages_received_total metric");
315
316        let mqtt_topics_active =
317            IntGauge::new("mockforge_mqtt_topics_active", "Number of active MQTT topics")
318                .expect("Failed to create mqtt_topics_active metric");
319
320        let mqtt_subscriptions_active = IntGauge::new(
321            "mockforge_mqtt_subscriptions_active",
322            "Number of active MQTT subscriptions",
323        )
324        .expect("Failed to create mqtt_subscriptions_active metric");
325
326        let mqtt_retained_messages =
327            IntGauge::new("mockforge_mqtt_retained_messages", "Number of retained MQTT messages")
328                .expect("Failed to create mqtt_retained_messages metric");
329
330        let mqtt_errors_total = IntCounterVec::new(
331            Opts::new("mockforge_mqtt_errors_total", "Total number of MQTT errors by type"),
332            &["error_type"],
333        )
334        .expect("Failed to create mqtt_errors_total metric");
335
336        // System metrics
337        let memory_usage_bytes =
338            Gauge::new("mockforge_memory_usage_bytes", "Memory usage in bytes")
339                .expect("Failed to create memory_usage_bytes metric");
340
341        let cpu_usage_percent = Gauge::new("mockforge_cpu_usage_percent", "CPU usage percentage")
342            .expect("Failed to create cpu_usage_percent metric");
343
344        let thread_count = Gauge::new("mockforge_thread_count", "Number of active threads")
345            .expect("Failed to create thread_count metric");
346
347        let uptime_seconds = Gauge::new("mockforge_uptime_seconds", "Server uptime in seconds")
348            .expect("Failed to create uptime_seconds metric");
349
350        // Scenario metrics
351        let active_scenario_mode = IntGauge::new(
352            "mockforge_active_scenario_mode",
353            "Active scenario mode (0=healthy, 1=degraded, 2=error, 3=chaos)",
354        )
355        .expect("Failed to create active_scenario_mode metric");
356
357        let chaos_triggers_total = IntCounter::new(
358            "mockforge_chaos_triggers_total",
359            "Total number of chaos mode triggers",
360        )
361        .expect("Failed to create chaos_triggers_total metric");
362
363        // Business/SLO metrics
364        let service_availability = GaugeVec::new(
365            Opts::new(
366                "mockforge_service_availability",
367                "Service availability percentage (0.0 to 1.0) by protocol",
368            ),
369            &["protocol"],
370        )
371        .expect("Failed to create service_availability metric");
372
373        let slo_compliance = GaugeVec::new(
374            Opts::new(
375                "mockforge_slo_compliance",
376                "SLO compliance percentage (0.0 to 1.0) by protocol and slo_type",
377            ),
378            &["protocol", "slo_type"],
379        )
380        .expect("Failed to create slo_compliance metric");
381
382        let successful_request_rate = GaugeVec::new(
383            Opts::new(
384                "mockforge_successful_request_rate",
385                "Successful request rate (0.0 to 1.0) by protocol",
386            ),
387            &["protocol"],
388        )
389        .expect("Failed to create successful_request_rate metric");
390
391        let p95_latency_slo_compliance = GaugeVec::new(
392            Opts::new(
393                "mockforge_p95_latency_slo_compliance",
394                "P95 latency SLO compliance (1.0 = compliant, 0.0 = non-compliant) by protocol",
395            ),
396            &["protocol"],
397        )
398        .expect("Failed to create p95_latency_slo_compliance metric");
399
400        let error_budget_remaining = GaugeVec::new(
401            Opts::new(
402                "mockforge_error_budget_remaining",
403                "Remaining error budget percentage (0.0 to 1.0) by protocol",
404            ),
405            &["protocol"],
406        )
407        .expect("Failed to create error_budget_remaining metric");
408
409        // Register all metrics
410        registry
411            .register(Box::new(requests_total.clone()))
412            .expect("Failed to register requests_total");
413        registry
414            .register(Box::new(requests_duration_seconds.clone()))
415            .expect("Failed to register requests_duration_seconds");
416        registry
417            .register(Box::new(requests_in_flight.clone()))
418            .expect("Failed to register requests_in_flight");
419        registry
420            .register(Box::new(requests_by_path_total.clone()))
421            .expect("Failed to register requests_by_path_total");
422        registry
423            .register(Box::new(request_duration_by_path_seconds.clone()))
424            .expect("Failed to register request_duration_by_path_seconds");
425        registry
426            .register(Box::new(average_latency_by_path_seconds.clone()))
427            .expect("Failed to register average_latency_by_path_seconds");
428        registry
429            .register(Box::new(workspace_requests_total.clone()))
430            .expect("Failed to register workspace_requests_total");
431        registry
432            .register(Box::new(workspace_requests_duration_seconds.clone()))
433            .expect("Failed to register workspace_requests_duration_seconds");
434        registry
435            .register(Box::new(workspace_active_routes.clone()))
436            .expect("Failed to register workspace_active_routes");
437        registry
438            .register(Box::new(workspace_errors_total.clone()))
439            .expect("Failed to register workspace_errors_total");
440        registry
441            .register(Box::new(errors_total.clone()))
442            .expect("Failed to register errors_total");
443        registry
444            .register(Box::new(error_rate.clone()))
445            .expect("Failed to register error_rate");
446        registry
447            .register(Box::new(plugin_executions_total.clone()))
448            .expect("Failed to register plugin_executions_total");
449        registry
450            .register(Box::new(plugin_execution_duration_seconds.clone()))
451            .expect("Failed to register plugin_execution_duration_seconds");
452        registry
453            .register(Box::new(plugin_errors_total.clone()))
454            .expect("Failed to register plugin_errors_total");
455        registry
456            .register(Box::new(ws_connections_active.clone()))
457            .expect("Failed to register ws_connections_active");
458        registry
459            .register(Box::new(ws_connections_total.clone()))
460            .expect("Failed to register ws_connections_total");
461        registry
462            .register(Box::new(ws_connection_duration_seconds.clone()))
463            .expect("Failed to register ws_connection_duration_seconds");
464        registry
465            .register(Box::new(ws_messages_sent.clone()))
466            .expect("Failed to register ws_messages_sent");
467        registry
468            .register(Box::new(ws_messages_received.clone()))
469            .expect("Failed to register ws_messages_received");
470        registry
471            .register(Box::new(ws_errors_total.clone()))
472            .expect("Failed to register ws_errors_total");
473        registry
474            .register(Box::new(smtp_connections_active.clone()))
475            .expect("Failed to register smtp_connections_active");
476        registry
477            .register(Box::new(smtp_connections_total.clone()))
478            .expect("Failed to register smtp_connections_total");
479        registry
480            .register(Box::new(smtp_messages_received_total.clone()))
481            .expect("Failed to register smtp_messages_received_total");
482        registry
483            .register(Box::new(smtp_messages_stored_total.clone()))
484            .expect("Failed to register smtp_messages_stored_total");
485        registry
486            .register(Box::new(smtp_errors_total.clone()))
487            .expect("Failed to register smtp_errors_total");
488        registry
489            .register(Box::new(mqtt_connections_active.clone()))
490            .expect("Failed to register mqtt_connections_active");
491        registry
492            .register(Box::new(mqtt_connections_total.clone()))
493            .expect("Failed to register mqtt_connections_total");
494        registry
495            .register(Box::new(mqtt_messages_published_total.clone()))
496            .expect("Failed to register mqtt_messages_published_total");
497        registry
498            .register(Box::new(mqtt_messages_received_total.clone()))
499            .expect("Failed to register mqtt_messages_received_total");
500        registry
501            .register(Box::new(mqtt_topics_active.clone()))
502            .expect("Failed to register mqtt_topics_active");
503        registry
504            .register(Box::new(mqtt_subscriptions_active.clone()))
505            .expect("Failed to register mqtt_subscriptions_active");
506        registry
507            .register(Box::new(mqtt_retained_messages.clone()))
508            .expect("Failed to register mqtt_retained_messages");
509        registry
510            .register(Box::new(mqtt_errors_total.clone()))
511            .expect("Failed to register mqtt_errors_total");
512        registry
513            .register(Box::new(memory_usage_bytes.clone()))
514            .expect("Failed to register memory_usage_bytes");
515        registry
516            .register(Box::new(cpu_usage_percent.clone()))
517            .expect("Failed to register cpu_usage_percent");
518        registry
519            .register(Box::new(thread_count.clone()))
520            .expect("Failed to register thread_count");
521        registry
522            .register(Box::new(uptime_seconds.clone()))
523            .expect("Failed to register uptime_seconds");
524        registry
525            .register(Box::new(active_scenario_mode.clone()))
526            .expect("Failed to register active_scenario_mode");
527        registry
528            .register(Box::new(chaos_triggers_total.clone()))
529            .expect("Failed to register chaos_triggers_total");
530        registry
531            .register(Box::new(service_availability.clone()))
532            .expect("Failed to register service_availability");
533        registry
534            .register(Box::new(slo_compliance.clone()))
535            .expect("Failed to register slo_compliance");
536        registry
537            .register(Box::new(successful_request_rate.clone()))
538            .expect("Failed to register successful_request_rate");
539        registry
540            .register(Box::new(p95_latency_slo_compliance.clone()))
541            .expect("Failed to register p95_latency_slo_compliance");
542        registry
543            .register(Box::new(error_budget_remaining.clone()))
544            .expect("Failed to register error_budget_remaining");
545
546        debug!("Initialized Prometheus metrics registry");
547
548        Self {
549            registry: Arc::new(registry),
550            requests_total,
551            requests_duration_seconds,
552            requests_in_flight,
553            requests_by_path_total,
554            request_duration_by_path_seconds,
555            average_latency_by_path_seconds,
556            workspace_requests_total,
557            workspace_requests_duration_seconds,
558            workspace_active_routes,
559            workspace_errors_total,
560            errors_total,
561            error_rate,
562            plugin_executions_total,
563            plugin_execution_duration_seconds,
564            plugin_errors_total,
565            ws_connections_active,
566            ws_connections_total,
567            ws_connection_duration_seconds,
568            ws_messages_sent,
569            ws_messages_received,
570            ws_errors_total,
571            smtp_connections_active,
572            smtp_connections_total,
573            smtp_messages_received_total,
574            smtp_messages_stored_total,
575            smtp_errors_total,
576            mqtt_connections_active,
577            mqtt_connections_total,
578            mqtt_messages_published_total,
579            mqtt_messages_received_total,
580            mqtt_topics_active,
581            mqtt_subscriptions_active,
582            mqtt_retained_messages,
583            mqtt_errors_total,
584            memory_usage_bytes,
585            cpu_usage_percent,
586            thread_count,
587            uptime_seconds,
588            active_scenario_mode,
589            chaos_triggers_total,
590            service_availability,
591            slo_compliance,
592            successful_request_rate,
593            p95_latency_slo_compliance,
594            error_budget_remaining,
595        }
596    }
597
598    /// Get the underlying Prometheus registry
599    pub fn registry(&self) -> &Registry {
600        &self.registry
601    }
602
    /// Check if the registry is initialized
    ///
    /// Always returns `true`: no state is inspected, so any existing
    /// `MetricsRegistry` value reports itself as initialized.
    pub fn is_initialized(&self) -> bool {
        true
    }
607
608    /// Record an HTTP request
609    pub fn record_http_request(&self, method: &str, status: u16, duration_seconds: f64) {
610        let status_str = status.to_string();
611        self.requests_total.with_label_values(&["http", method, &status_str]).inc();
612        self.requests_duration_seconds
613            .with_label_values(&["http", method])
614            .observe(duration_seconds);
615    }
616
617    /// Record a gRPC request
618    pub fn record_grpc_request(&self, method: &str, status: &str, duration_seconds: f64) {
619        self.requests_total.with_label_values(&["grpc", method, status]).inc();
620        self.requests_duration_seconds
621            .with_label_values(&["grpc", method])
622            .observe(duration_seconds);
623    }
624
625    /// Record a WebSocket message
626    pub fn record_ws_message_sent(&self) {
627        self.ws_messages_sent.inc();
628    }
629
630    /// Record a WebSocket message received
631    pub fn record_ws_message_received(&self) {
632        self.ws_messages_received.inc();
633    }
634
635    /// Record a GraphQL request
636    pub fn record_graphql_request(&self, operation: &str, status: u16, duration_seconds: f64) {
637        let status_str = status.to_string();
638        self.requests_total
639            .with_label_values(&["graphql", operation, &status_str])
640            .inc();
641        self.requests_duration_seconds
642            .with_label_values(&["graphql", operation])
643            .observe(duration_seconds);
644    }
645
646    /// Record a plugin execution
647    pub fn record_plugin_execution(&self, plugin_name: &str, success: bool, duration_seconds: f64) {
648        let status = if success { "success" } else { "failure" };
649        self.plugin_executions_total.with_label_values(&[plugin_name, status]).inc();
650        self.plugin_execution_duration_seconds
651            .with_label_values(&[plugin_name])
652            .observe(duration_seconds);
653    }
654
655    /// Increment in-flight requests
656    pub fn increment_in_flight(&self, protocol: &str) {
657        self.requests_in_flight.with_label_values(&[protocol]).inc();
658    }
659
660    /// Decrement in-flight requests
661    pub fn decrement_in_flight(&self, protocol: &str) {
662        self.requests_in_flight.with_label_values(&[protocol]).dec();
663    }
664
665    /// Record an error
666    pub fn record_error(&self, protocol: &str, error_type: &str) {
667        self.errors_total.with_label_values(&[protocol, error_type]).inc();
668    }
669
670    /// Update memory usage
671    pub fn update_memory_usage(&self, bytes: f64) {
672        self.memory_usage_bytes.set(bytes);
673    }
674
675    /// Update CPU usage
676    pub fn update_cpu_usage(&self, percent: f64) {
677        self.cpu_usage_percent.set(percent);
678    }
679
680    /// Set active scenario mode (0=healthy, 1=degraded, 2=error, 3=chaos)
681    pub fn set_scenario_mode(&self, mode: i64) {
682        self.active_scenario_mode.set(mode);
683    }
684
685    /// Record a chaos trigger
686    pub fn record_chaos_trigger(&self) {
687        self.chaos_triggers_total.inc();
688    }
689
690    /// Record an HTTP request with path information
691    pub fn record_http_request_with_path(
692        &self,
693        path: &str,
694        method: &str,
695        status: u16,
696        duration_seconds: f64,
697    ) {
698        // Normalize path to avoid cardinality explosion
699        let normalized_path = normalize_path(path);
700        let status_str = status.to_string();
701
702        // Record by path
703        self.requests_by_path_total
704            .with_label_values(&[normalized_path.as_str(), method, status_str.as_str()])
705            .inc();
706        self.request_duration_by_path_seconds
707            .with_label_values(&[normalized_path.as_str(), method])
708            .observe(duration_seconds);
709
710        // Update average latency (simple moving average approximation)
711        // Note: For production use, consider using a proper moving average or quantiles
712        let current = self
713            .average_latency_by_path_seconds
714            .with_label_values(&[normalized_path.as_str(), method])
715            .get();
716        let new_avg = if current == 0.0 {
717            duration_seconds
718        } else {
719            (current * 0.95) + (duration_seconds * 0.05)
720        };
721        self.average_latency_by_path_seconds
722            .with_label_values(&[normalized_path.as_str(), method])
723            .set(new_avg);
724
725        // Also record in the general metrics
726        self.record_http_request(method, status, duration_seconds);
727    }
728
729    /// Record a WebSocket connection established
730    pub fn record_ws_connection_established(&self) {
731        self.ws_connections_total.inc();
732        self.ws_connections_active.inc();
733    }
734
735    /// Record a WebSocket connection closed
736    pub fn record_ws_connection_closed(&self, duration_seconds: f64, status: &str) {
737        self.ws_connections_active.dec();
738        self.ws_connection_duration_seconds
739            .with_label_values(&[status])
740            .observe(duration_seconds);
741    }
742
743    /// Record a WebSocket error
744    pub fn record_ws_error(&self) {
745        self.ws_errors_total.inc();
746    }
747
748    /// Record an SMTP connection established
749    pub fn record_smtp_connection_established(&self) {
750        self.smtp_connections_total.inc();
751        self.smtp_connections_active.inc();
752    }
753
754    /// Record an SMTP connection closed
755    pub fn record_smtp_connection_closed(&self) {
756        self.smtp_connections_active.dec();
757    }
758
759    /// Record an SMTP message received
760    pub fn record_smtp_message_received(&self) {
761        self.smtp_messages_received_total.inc();
762    }
763
764    /// Record an SMTP message stored
765    pub fn record_smtp_message_stored(&self) {
766        self.smtp_messages_stored_total.inc();
767    }
768
769    /// Record an SMTP error
770    pub fn record_smtp_error(&self, error_type: &str) {
771        self.smtp_errors_total.with_label_values(&[error_type]).inc();
772    }
773
774    /// Update thread count
775    pub fn update_thread_count(&self, count: f64) {
776        self.thread_count.set(count);
777    }
778
779    /// Update uptime
780    pub fn update_uptime(&self, seconds: f64) {
781        self.uptime_seconds.set(seconds);
782    }
783
784    // ==================== Workspace-specific metrics ====================
785
786    /// Record a workspace request
787    pub fn record_workspace_request(
788        &self,
789        workspace_id: &str,
790        method: &str,
791        status: u16,
792        duration_seconds: f64,
793    ) {
794        let status_str = status.to_string();
795        self.workspace_requests_total
796            .with_label_values(&[workspace_id, method, &status_str])
797            .inc();
798        self.workspace_requests_duration_seconds
799            .with_label_values(&[workspace_id, method])
800            .observe(duration_seconds);
801    }
802
803    /// Update workspace active routes count
804    pub fn update_workspace_active_routes(&self, workspace_id: &str, count: i64) {
805        self.workspace_active_routes.with_label_values(&[workspace_id]).set(count);
806    }
807
808    /// Record a workspace error
809    pub fn record_workspace_error(&self, workspace_id: &str, error_type: &str) {
810        self.workspace_errors_total.with_label_values(&[workspace_id, error_type]).inc();
811    }
812
813    /// Increment workspace active routes
814    pub fn increment_workspace_routes(&self, workspace_id: &str) {
815        self.workspace_active_routes.with_label_values(&[workspace_id]).inc();
816    }
817
818    /// Decrement workspace active routes
819    pub fn decrement_workspace_routes(&self, workspace_id: &str) {
820        self.workspace_active_routes.with_label_values(&[workspace_id]).dec();
821    }
822}
823
824/// Normalize path to avoid high cardinality
825///
826/// This function replaces dynamic path segments (IDs, UUIDs, etc.) with placeholders
827/// to prevent metric explosion.
828fn normalize_path(path: &str) -> String {
829    let mut segments: Vec<&str> = path.split('/').collect();
830
831    for segment in &mut segments {
832        // Replace UUIDs, numeric IDs, or hex strings with :id placeholder
833        if is_uuid(segment)
834            || segment.parse::<i64>().is_ok()
835            || (segment.len() > 8 && segment.chars().all(|c| c.is_ascii_hexdigit()))
836        {
837            *segment = ":id";
838        }
839    }
840
841    segments.join("/")
842}
843
/// Check whether a string is a UUID in canonical hyphenated form.
///
/// Returns `true` only when `s` is exactly 36 bytes long, has `-` at byte
/// offsets 8, 13, 18 and 23 (the `8-4-4-4-12` layout), and every other byte is
/// an ASCII hex digit of either case. Version and variant bits are not checked.
///
/// Checking the hyphen positions and the hex alphabet, not just the length and
/// hyphen count, keeps ordinary 36-character slugs that happen to contain four
/// hyphens from being mistaken for identifiers.
fn is_uuid(s: &str) -> bool {
    const UUID_LEN: usize = 36;
    const HYPHEN_OFFSETS: [usize; 4] = [8, 13, 18, 23];

    // Work on bytes: any non-ASCII byte fails both checks below, so multi-byte
    // characters can never be accepted.
    let bytes = s.as_bytes();

    bytes.len() == UUID_LEN
        && bytes.iter().enumerate().all(|(offset, &byte)| {
            if HYPHEN_OFFSETS.contains(&offset) {
                byte == b'-'
            } else {
                byte.is_ascii_hexdigit()
            }
        })
}
848
849impl Default for MetricsRegistry {
850    fn default() -> Self {
851        Self::new()
852    }
853}
854
/// Process-wide metrics registry instance.
///
/// Wrapped in `Lazy`, so `MetricsRegistry::new` runs on first access rather
/// than at program start. Read it through `get_global_registry`.
static GLOBAL_REGISTRY: Lazy<MetricsRegistry> = Lazy::new(MetricsRegistry::new);
857
858/// Get the global metrics registry
859pub fn get_global_registry() -> &'static MetricsRegistry {
860    &GLOBAL_REGISTRY
861}
862
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used when comparing a gauge reading with the value just stored in it.
    const GAUGE_TOLERANCE: f64 = 1e-9;

    #[test]
    fn test_metrics_registry_creation() {
        let registry = MetricsRegistry::new();
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_record_http_request() {
        let registry = MetricsRegistry::new();
        registry.record_http_request("GET", 200, 0.045);
        registry.record_http_request("POST", 201, 0.123);

        // Smoke test: recording must not panic.
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_global_registry() {
        let registry = get_global_registry();
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_plugin_metrics() {
        let registry = MetricsRegistry::new();
        registry.record_plugin_execution("test-plugin", true, 0.025);
        registry.record_plugin_execution("test-plugin", false, 0.050);

        // Smoke test: recording must not panic.
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_websocket_metrics() {
        let registry = MetricsRegistry::new();
        registry.record_ws_message_sent();
        registry.record_ws_message_received();

        registry.record_ws_connection_established();
        assert_eq!(registry.ws_connections_total.get(), 1);
        assert_eq!(registry.ws_connections_active.get(), 1);

        registry.record_ws_connection_closed(120.5, "normal");
        registry.record_ws_error();

        // Closing releases the active slot but keeps the lifetime total.
        assert_eq!(registry.ws_connections_total.get(), 1);
        assert_eq!(registry.ws_connections_active.get(), 0);
        assert_eq!(
            registry
                .ws_connection_duration_seconds
                .with_label_values(&["normal"])
                .get_sample_count(),
            1
        );
        assert_eq!(registry.ws_errors_total.get(), 1);
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_path_normalization() {
        assert_eq!(normalize_path("/api/users/123"), "/api/users/:id");
        assert_eq!(
            normalize_path("/api/users/550e8400-e29b-41d4-a716-446655440000"),
            "/api/users/:id"
        );
        assert_eq!(normalize_path("/api/users/abc123def456"), "/api/users/:id");
        assert_eq!(normalize_path("/api/users/list"), "/api/users/list");

        // Several dynamic segments in one path are each replaced.
        assert_eq!(normalize_path("/orders/42/items/7"), "/orders/:id/items/:id");
        // Hex-looking segments of 8 characters or fewer are left alone.
        assert_eq!(normalize_path("/api/users/deadbeef"), "/api/users/deadbeef");
        // Degenerate inputs round-trip unchanged.
        assert_eq!(normalize_path("/"), "/");
        assert_eq!(normalize_path(""), "");
    }

    #[test]
    fn test_path_based_metrics() {
        let registry = MetricsRegistry::new();
        registry.record_http_request_with_path("/api/users/123", "GET", 200, 0.045);
        registry.record_http_request_with_path("/api/users/456", "GET", 200, 0.055);
        registry.record_http_request_with_path("/api/posts", "POST", 201, 0.123);

        // Smoke test: recording must not panic.
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_smtp_metrics() {
        let registry = MetricsRegistry::new();

        registry.record_smtp_connection_established();
        assert_eq!(registry.smtp_connections_active.get(), 1);

        registry.record_smtp_message_received();
        registry.record_smtp_message_stored();
        registry.record_smtp_connection_closed();
        registry.record_smtp_error("timeout");

        assert_eq!(registry.smtp_connections_total.get(), 1);
        assert_eq!(registry.smtp_connections_active.get(), 0);
        assert_eq!(registry.smtp_messages_received_total.get(), 1);
        assert_eq!(registry.smtp_messages_stored_total.get(), 1);
        assert_eq!(registry.smtp_errors_total.with_label_values(&["timeout"]).get(), 1);
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_system_metrics() {
        let registry = MetricsRegistry::new();
        registry.update_memory_usage(1024.0 * 1024.0 * 100.0); // 100 MB
        registry.update_cpu_usage(45.5);
        registry.update_thread_count(25.0);
        registry.update_uptime(3600.0); // 1 hour

        assert!((registry.thread_count.get() - 25.0).abs() < GAUGE_TOLERANCE);
        assert!((registry.uptime_seconds.get() - 3600.0).abs() < GAUGE_TOLERANCE);
        assert!(registry.is_initialized());
    }

    #[test]
    fn test_workspace_metrics() {
        let registry = MetricsRegistry::new();

        // Record workspace requests
        registry.record_workspace_request("workspace1", "GET", 200, 0.045);
        registry.record_workspace_request("workspace1", "POST", 201, 0.123);
        registry.record_workspace_request("workspace2", "GET", 200, 0.055);

        let requests = &registry.workspace_requests_total;
        assert_eq!(requests.with_label_values(&["workspace1", "GET", "200"]).get(), 1);
        assert_eq!(requests.with_label_values(&["workspace1", "POST", "201"]).get(), 1);
        assert_eq!(requests.with_label_values(&["workspace2", "GET", "200"]).get(), 1);

        let durations = &registry.workspace_requests_duration_seconds;
        assert_eq!(durations.with_label_values(&["workspace1", "GET"]).get_sample_count(), 1);
        assert_eq!(durations.with_label_values(&["workspace1", "POST"]).get_sample_count(), 1);
        assert_eq!(durations.with_label_values(&["workspace2", "GET"]).get_sample_count(), 1);

        // Update active routes
        registry.update_workspace_active_routes("workspace1", 10);
        registry.update_workspace_active_routes("workspace2", 5);

        // Record errors
        registry.record_workspace_error("workspace1", "validation");
        registry.record_workspace_error("workspace2", "timeout");

        let errors = &registry.workspace_errors_total;
        assert_eq!(errors.with_label_values(&["workspace1", "validation"]).get(), 1);
        assert_eq!(errors.with_label_values(&["workspace2", "timeout"]).get(), 1);

        // Test increment/decrement
        let routes = &registry.workspace_active_routes;
        registry.increment_workspace_routes("workspace1");
        assert_eq!(routes.with_label_values(&["workspace1"]).get(), 11);
        registry.decrement_workspace_routes("workspace1");
        assert_eq!(routes.with_label_values(&["workspace1"]).get(), 10);
        assert_eq!(routes.with_label_values(&["workspace2"]).get(), 5);

        assert!(registry.is_initialized());
    }

    #[test]
    fn test_workspace_metrics_isolation() {
        let registry = MetricsRegistry::new();

        // Ensure metrics for different workspaces are independent
        registry.record_workspace_request("ws1", "GET", 200, 0.1);
        registry.record_workspace_request("ws2", "GET", 200, 0.2);

        registry.update_workspace_active_routes("ws1", 5);
        registry.update_workspace_active_routes("ws2", 10);

        // Each workspace sees only its own request and its own route count.
        let requests = &registry.workspace_requests_total;
        assert_eq!(requests.with_label_values(&["ws1", "GET", "200"]).get(), 1);
        assert_eq!(requests.with_label_values(&["ws2", "GET", "200"]).get(), 1);

        let routes = &registry.workspace_active_routes;
        assert_eq!(routes.with_label_values(&["ws1"]).get(), 5);
        assert_eq!(routes.with_label_values(&["ws2"]).get(), 10);

        // Changing one workspace must leave the other untouched.
        registry.increment_workspace_routes("ws1");
        assert_eq!(routes.with_label_values(&["ws1"]).get(), 6);
        assert_eq!(routes.with_label_values(&["ws2"]).get(), 10);

        assert!(registry.is_initialized());
    }
}