mockforge_chaos/
metrics.rs

1//! Prometheus metrics for chaos engineering
2//!
3//! Provides real-time metrics that can be integrated with Grafana
4//! for monitoring chaos orchestrations, scenarios, and system impact.
5
6use once_cell::sync::Lazy;
7use prometheus::{
8    register_counter_vec, register_gauge_vec, register_histogram_vec, CounterVec, GaugeVec,
9    HistogramVec, Registry,
10};
11
12/// Chaos orchestration metrics
13pub struct ChaosMetrics {
14    /// Number of scenarios executed
15    pub scenarios_total: CounterVec,
16
17    /// Number of faults injected
18    pub faults_injected_total: CounterVec,
19
20    /// Latency injected (histogram)
21    pub latency_injected: HistogramVec,
22
23    /// Rate limit violations
24    pub rate_limit_violations_total: CounterVec,
25
26    /// Circuit breaker state
27    pub circuit_breaker_state: GaugeVec,
28
29    /// Bulkhead concurrent requests
30    pub bulkhead_concurrent: GaugeVec,
31
32    /// Orchestration step duration
33    pub orchestration_step_duration: HistogramVec,
34
35    /// Orchestration execution status
36    pub orchestration_executions_total: CounterVec,
37
38    /// Active orchestrations
39    pub active_orchestrations: GaugeVec,
40
41    /// Assertion results
42    pub assertion_results_total: CounterVec,
43
44    /// Hook executions
45    pub hook_executions_total: CounterVec,
46
47    /// Recommendation count
48    pub recommendations_total: GaugeVec,
49
50    /// System impact score
51    pub chaos_impact_score: GaugeVec,
52}
53
54impl ChaosMetrics {
55    /// Create new metrics
56    pub fn new() -> Result<Self, prometheus::Error> {
57        Ok(Self {
58            scenarios_total: register_counter_vec!(
59                "mockforge_chaos_scenarios_total",
60                "Total number of chaos scenarios executed",
61                &["scenario_type", "status"]
62            )?,
63
64            faults_injected_total: register_counter_vec!(
65                "mockforge_chaos_faults_total",
66                "Total number of faults injected",
67                &["fault_type", "endpoint"]
68            )?,
69
70            latency_injected: register_histogram_vec!(
71                "mockforge_chaos_latency_ms",
72                "Latency injected in milliseconds",
73                &["endpoint"],
74                vec![10.0, 50.0, 100.0, 250.0, 500.0, 1000.0, 2500.0, 5000.0, 10000.0]
75            )?,
76
77            rate_limit_violations_total: register_counter_vec!(
78                "mockforge_chaos_rate_limit_violations_total",
79                "Total rate limit violations",
80                &["endpoint"]
81            )?,
82
83            circuit_breaker_state: register_gauge_vec!(
84                "mockforge_chaos_circuit_breaker_state",
85                "Circuit breaker state (0=closed, 1=open, 2=half-open)",
86                &["circuit_name"]
87            )?,
88
89            bulkhead_concurrent: register_gauge_vec!(
90                "mockforge_chaos_bulkhead_concurrent_requests",
91                "Current concurrent requests in bulkhead",
92                &["bulkhead_name"]
93            )?,
94
95            orchestration_step_duration: register_histogram_vec!(
96                "mockforge_chaos_orchestration_step_duration_seconds",
97                "Duration of orchestration steps in seconds",
98                &["orchestration", "step"],
99                vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]
100            )?,
101
102            orchestration_executions_total: register_counter_vec!(
103                "mockforge_chaos_orchestration_executions_total",
104                "Total orchestration executions",
105                &["orchestration", "status"]
106            )?,
107
108            active_orchestrations: register_gauge_vec!(
109                "mockforge_chaos_active_orchestrations",
110                "Number of active orchestrations",
111                &["orchestration"]
112            )?,
113
114            assertion_results_total: register_counter_vec!(
115                "mockforge_chaos_assertion_results_total",
116                "Total assertion results",
117                &["orchestration", "result"]
118            )?,
119
120            hook_executions_total: register_counter_vec!(
121                "mockforge_chaos_hook_executions_total",
122                "Total hook executions",
123                &["hook_type", "status"]
124            )?,
125
126            recommendations_total: register_gauge_vec!(
127                "mockforge_chaos_recommendations_total",
128                "Number of AI recommendations",
129                &["category", "severity"]
130            )?,
131
132            chaos_impact_score: register_gauge_vec!(
133                "mockforge_chaos_impact_score",
134                "Overall chaos impact score (0.0-1.0)",
135                &["time_window"]
136            )?,
137        })
138    }
139
140    /// Record scenario execution
141    pub fn record_scenario(&self, scenario_type: &str, success: bool) {
142        self.scenarios_total
143            .with_label_values(&[scenario_type, if success { "success" } else { "failure" }])
144            .inc();
145    }
146
147    /// Record fault injection
148    pub fn record_fault(&self, fault_type: &str, endpoint: &str) {
149        self.faults_injected_total.with_label_values(&[fault_type, endpoint]).inc();
150    }
151
152    /// Record latency injection
153    pub fn record_latency(&self, endpoint: &str, latency_ms: f64) {
154        self.latency_injected.with_label_values(&[endpoint]).observe(latency_ms);
155    }
156
157    /// Record rate limit violation
158    pub fn record_rate_limit_violation(&self, endpoint: &str) {
159        self.rate_limit_violations_total.with_label_values(&[endpoint]).inc();
160    }
161
162    /// Update circuit breaker state
163    pub fn update_circuit_breaker_state(&self, circuit_name: &str, state: f64) {
164        self.circuit_breaker_state.with_label_values(&[circuit_name]).set(state);
165    }
166
167    /// Update bulkhead concurrent requests
168    pub fn update_bulkhead_concurrent(&self, bulkhead_name: &str, count: f64) {
169        self.bulkhead_concurrent.with_label_values(&[bulkhead_name]).set(count);
170    }
171
172    /// Record orchestration step duration
173    pub fn record_step_duration(&self, orchestration: &str, step: &str, duration_secs: f64) {
174        self.orchestration_step_duration
175            .with_label_values(&[orchestration, step])
176            .observe(duration_secs);
177    }
178
179    /// Record orchestration execution
180    pub fn record_orchestration_execution(&self, orchestration: &str, success: bool) {
181        self.orchestration_executions_total
182            .with_label_values(&[orchestration, if success { "success" } else { "failure" }])
183            .inc();
184    }
185
186    /// Update active orchestrations
187    pub fn update_active_orchestrations(&self, orchestration: &str, active: bool) {
188        if active {
189            self.active_orchestrations.with_label_values(&[orchestration]).inc();
190        } else {
191            self.active_orchestrations.with_label_values(&[orchestration]).dec();
192        }
193    }
194
195    /// Record assertion result
196    pub fn record_assertion(&self, orchestration: &str, passed: bool) {
197        self.assertion_results_total
198            .with_label_values(&[orchestration, if passed { "passed" } else { "failed" }])
199            .inc();
200    }
201
202    /// Record hook execution
203    pub fn record_hook(&self, hook_type: &str, success: bool) {
204        self.hook_executions_total
205            .with_label_values(&[hook_type, if success { "success" } else { "failure" }])
206            .inc();
207    }
208
209    /// Update recommendations count
210    pub fn update_recommendations(&self, category: &str, severity: &str, count: f64) {
211        self.recommendations_total.with_label_values(&[category, severity]).set(count);
212    }
213
214    /// Update chaos impact score
215    pub fn update_impact_score(&self, time_window: &str, score: f64) {
216        self.chaos_impact_score.with_label_values(&[time_window]).set(score);
217    }
218}
219
220impl Default for ChaosMetrics {
221    fn default() -> Self {
222        Self::new().expect("Failed to create chaos metrics")
223    }
224}
225
226/// Global metrics instance
227pub static CHAOS_METRICS: Lazy<ChaosMetrics> =
228    Lazy::new(|| ChaosMetrics::new().expect("Failed to initialize chaos metrics"));
229
230/// Get the default Prometheus registry
231pub fn registry() -> &'static Registry {
232    prometheus::default_registry()
233}
234
#[cfg(test)]
mod tests {
    use super::*;

    /// Forcing the lazy global must not panic, i.e. every registration succeeded.
    #[test]
    fn test_metrics_creation() {
        // Dereferencing `CHAOS_METRICS` runs its initializer on first access.
        // A second `ChaosMetrics::new()` is deliberately avoided: the metric
        // names are already taken in the global Prometheus registry, so it
        // would fail with "AlreadyReg".
        let _metrics = &*CHAOS_METRICS;
    }

    /// `record_scenario(.., true)` must bump the `"success"` counter.
    #[test]
    fn test_record_scenario() {
        // Re-resolve the child on every read instead of cloning the whole vec.
        let current = || CHAOS_METRICS.scenarios_total.with_label_values(&["test", "success"]).get();
        let before = current();

        CHAOS_METRICS.record_scenario("test", true);

        // `>` rather than `==`: the global instance is shared by parallel tests.
        assert!(current() > before);
    }

    /// `record_latency` must add an observation to the endpoint's histogram.
    #[test]
    fn test_record_latency() {
        let sample_count = || {
            CHAOS_METRICS
                .latency_injected
                .with_label_values(&["/api/test"])
                .get_sample_count()
        };
        let before = sample_count();

        CHAOS_METRICS.record_latency("/api/test", 100.0);

        // `>` rather than `==`: the global instance is shared by parallel tests.
        assert!(sample_count() > before);
    }
}