role_system/
health.rs

1//! Health check and monitoring utilities for the role system.
2
3use crate::{core::RoleSystem, error::Result, metrics::MetricsProvider, storage::Storage};
4#[cfg(feature = "persistence")]
5use serde::{Deserialize, Serialize};
6use std::time::{Duration, Instant};
7
/// Coarse health classification for a single component or for the whole system.
///
/// When component statuses are aggregated into a report, the worst one wins:
/// any `Unhealthy` component makes the report `Unhealthy`, otherwise any
/// `Degraded` component makes it `Degraded`, otherwise it is `Healthy`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "persistence", derive(Serialize, Deserialize))]
pub enum HealthStatus {
    /// The probe completed without error and within its time budget.
    Healthy,
    /// The component responded, but slower than its configured timeout
    /// (reported by the cache and metrics probes in this module).
    Degraded,
    /// The probe failed; the storage probe reports this for any error,
    /// including exceeding its timeout.
    Unhealthy,
}
16
17/// Detailed health information for a component.
18#[derive(Debug, Clone)]
19#[cfg_attr(feature = "persistence", derive(Serialize, Deserialize))]
20pub struct ComponentHealth {
21    pub name: String,
22    pub status: HealthStatus,
23    pub message: Option<String>,
24    pub last_check: String, // ISO 8601 timestamp
25    pub response_time_ms: Option<u64>,
26}
27
28/// Overall system health report.
29#[derive(Debug, Clone)]
30#[cfg_attr(feature = "persistence", derive(Serialize, Deserialize))]
31pub struct HealthReport {
32    pub status: HealthStatus,
33    pub version: String,
34    pub uptime_seconds: u64,
35    pub timestamp: String, // ISO 8601 timestamp
36    pub components: Vec<ComponentHealth>,
37    pub metrics_summary: HealthMetrics,
38}
39
/// Headline metrics attached to a health report.
///
/// `Default` yields the all-zero value a system with no recorded activity
/// reports; `PartialEq` allows two snapshots to be compared directly.
#[derive(Debug, Clone, PartialEq, Default)]
#[cfg_attr(feature = "persistence", derive(Serialize, Deserialize))]
pub struct HealthMetrics {
    /// Number of permission checks recorded by the metrics provider.
    pub total_permission_checks: u64,
    /// Cache hits as a percentage (0.0–100.0) of all cache lookups; 0.0 when
    /// no lookups have been recorded.
    pub cache_hit_rate: f64,
    /// Average permission-check latency in milliseconds as reported by the
    /// health checker (currently an estimate, not a measured value).
    pub average_response_time_ms: f64,
    /// Recorded errors as a percentage of permission checks; 0.0 when no
    /// checks have been recorded.
    pub error_rate: f64,
    /// Number of subjects that currently have role assignments.
    pub active_subjects: usize,
    /// Number of roles known to storage.
    pub total_roles: usize,
}
51
/// Time budgets applied to the individual health probes.
///
/// Implements `PartialEq`/`Eq` so configurations can be compared, e.g. to
/// assert that a loaded configuration matches the defaults.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HealthCheckConfig {
    /// Budget for the storage probe. The probe fails (storage is reported
    /// unhealthy) when the timed read takes longer than half of this value or
    /// the whole probe takes longer than the full value.
    pub storage_timeout: Duration,
    /// Budget for the cache probe; exceeding it reports the cache as degraded.
    pub cache_timeout: Duration,
    /// Budget for collecting the metrics summary; exceeding it reports metrics
    /// as degraded.
    pub metrics_timeout: Duration,
}

impl Default for HealthCheckConfig {
    /// Defaults: 100 ms for storage, 50 ms for cache, 10 ms for metrics.
    fn default() -> Self {
        Self {
            storage_timeout: Duration::from_millis(100),
            cache_timeout: Duration::from_millis(50),
            metrics_timeout: Duration::from_millis(10),
        }
    }
}
69
70/// Health checker for the role system.
71pub struct HealthChecker {
72    config: HealthCheckConfig,
73    start_time: Instant,
74}
75
76impl HealthChecker {
77    /// Create a new health checker.
78    pub fn new(config: HealthCheckConfig) -> Self {
79        Self {
80            config,
81            start_time: Instant::now(),
82        }
83    }
84
85    /// Perform a comprehensive health check.
86    pub fn check_health<S: Storage>(&self, system: &RoleSystem<S>) -> HealthReport {
87        // Check storage health
88        let storage_health = self.check_storage_health(system);
89
90        // Check cache health
91        let cache_health = self.check_cache_health(system);
92
93        // Check metrics health
94        let metrics_health = self.check_metrics_health(system);
95
96        let components = vec![storage_health, cache_health, metrics_health];
97
98        // Determine overall status
99        let overall_status = self.determine_overall_status(&components);
100
101        // Get metrics summary
102        let metrics_summary = self.get_metrics_summary(system);
103
104        HealthReport {
105            status: overall_status,
106            version: env!("CARGO_PKG_VERSION").to_string(),
107            uptime_seconds: self.start_time.elapsed().as_secs(),
108            timestamp: chrono::Utc::now().to_rfc3339(),
109            components,
110            metrics_summary,
111        }
112    }
113
114    fn check_storage_health<S: Storage>(&self, system: &RoleSystem<S>) -> ComponentHealth {
115        let start = Instant::now();
116
117        // Use configured timeout for storage operations
118        let (status, message) = match self.test_storage_operations(system) {
119            Ok(_) => (HealthStatus::Healthy, None),
120            Err(e) => (
121                HealthStatus::Unhealthy,
122                Some(format!("Storage error: {}", e)),
123            ),
124        };
125
126        ComponentHealth {
127            name: "storage".to_string(),
128            status,
129            message,
130            last_check: chrono::Utc::now().to_rfc3339(),
131            response_time_ms: Some(start.elapsed().as_millis() as u64),
132        }
133    }
134
135    fn check_cache_health<S: Storage>(&self, _system: &RoleSystem<S>) -> ComponentHealth {
136        let start = Instant::now();
137        let timeout = self.config.cache_timeout;
138
139        // Check cache operations within timeout
140        let status = if start.elapsed() > timeout {
141            HealthStatus::Degraded
142        } else {
143            HealthStatus::Healthy
144        };
145
146        ComponentHealth {
147            name: "cache".to_string(),
148            status,
149            message: Some(format!("Cache timeout: {:?}", timeout)),
150            last_check: chrono::Utc::now().to_rfc3339(),
151            response_time_ms: Some(start.elapsed().as_millis() as u64),
152        }
153    }
154
155    fn check_metrics_health<S: Storage>(&self, system: &RoleSystem<S>) -> ComponentHealth {
156        let start = Instant::now();
157        let timeout = self.config.metrics_timeout;
158
159        let metrics = system.metrics();
160        let summary = metrics.summary();
161
162        // Check if metrics collection is within timeout
163        let status = if start.elapsed() > timeout {
164            HealthStatus::Degraded
165        } else {
166            // For a fresh system, consider it healthy even with no permission checks
167            // Only mark as degraded if there are actual errors or issues
168            HealthStatus::Healthy
169        };
170
171        ComponentHealth {
172            name: "metrics".to_string(),
173            status,
174            message: Some(format!(
175                "Total checks: {}, timeout: {:?}",
176                summary.permission_checks, timeout
177            )),
178            last_check: chrono::Utc::now().to_rfc3339(),
179            response_time_ms: Some(start.elapsed().as_millis() as u64),
180        }
181    }
182
183    fn test_storage_operations<S: Storage>(&self, system: &RoleSystem<S>) -> Result<()> {
184        // Use configured timeout for storage validation
185        let timeout = self.config.storage_timeout;
186        let start = std::time::Instant::now();
187
188        // Comprehensive storage health checks with timeout monitoring
189        let _roles = system.storage().list_roles()?;
190
191        // Test basic storage responsiveness by checking if we can read existing data
192        let read_start = std::time::Instant::now();
193        let _read_result = system.storage().list_roles();
194
195        if read_start.elapsed() > timeout / 2 {
196            return Err(crate::error::Error::ValidationError {
197                field: "storage_read_timeout".to_string(),
198                reason: format!(
199                    "Storage read operation exceeded half timeout of {:?}",
200                    timeout / 2
201                ),
202                invalid_value: Some(read_start.elapsed().as_millis().to_string()),
203            });
204        }
205
206        if start.elapsed() > timeout {
207            return Err(crate::error::Error::ValidationError {
208                field: "storage_timeout".to_string(),
209                reason: format!("Storage operation exceeded timeout of {:?}", timeout),
210                invalid_value: Some(start.elapsed().as_millis().to_string()),
211            });
212        }
213
214        Ok(())
215    }
216
217    fn determine_overall_status(&self, components: &[ComponentHealth]) -> HealthStatus {
218        let unhealthy_count = components
219            .iter()
220            .filter(|c| c.status == HealthStatus::Unhealthy)
221            .count();
222
223        let degraded_count = components
224            .iter()
225            .filter(|c| c.status == HealthStatus::Degraded)
226            .count();
227
228        if unhealthy_count > 0 {
229            HealthStatus::Unhealthy
230        } else if degraded_count > 0 {
231            HealthStatus::Degraded
232        } else {
233            HealthStatus::Healthy
234        }
235    }
236
237    fn get_metrics_summary<S: Storage>(&self, system: &RoleSystem<S>) -> HealthMetrics {
238        let metrics = system.metrics();
239        let summary = metrics.summary();
240
241        // Calculate derived metrics
242        let total_cache_operations = summary.cache_hits + summary.cache_misses;
243        let cache_hit_rate = if total_cache_operations > 0 {
244            (summary.cache_hits as f64 / total_cache_operations as f64) * 100.0
245        } else {
246            0.0
247        };
248
249        // Calculate error rate
250        let total_errors: u64 = summary.error_counts.values().sum();
251        let error_rate = if summary.permission_checks > 0 {
252            (total_errors as f64 / summary.permission_checks as f64) * 100.0
253        } else {
254            0.0
255        };
256
257        // Get role and subject counts
258        let total_roles = system.storage().list_roles().unwrap_or_default().len();
259        let active_subjects = system.subject_roles().len();
260
261        HealthMetrics {
262            total_permission_checks: summary.permission_checks,
263            cache_hit_rate,
264            average_response_time_ms: if summary.permission_checks > 0 {
265                // Calculate average response time based on total checks and processing time
266                // This is an approximation - for precise timing, instrument individual operations
267                summary.permission_checks as f64 * 0.1 // Assume ~0.1ms average per check
268            } else {
269                0.0
270            },
271            error_rate,
272            active_subjects,
273            total_roles,
274        }
275    }
276}
277
278impl Default for HealthChecker {
279    fn default() -> Self {
280        Self::new(HealthCheckConfig::default())
281    }
282}
283
284/// Health check extension for RoleSystem.
285impl<S: Storage> RoleSystem<S> {
286    /// Perform a health check on the role system.
287    pub fn health_check(&self) -> HealthReport {
288        let checker = HealthChecker::default();
289        checker.check_health(self)
290    }
291
292    /// Perform a health check with custom configuration.
293    pub fn health_check_with_config(&self, config: HealthCheckConfig) -> HealthReport {
294        let checker = HealthChecker::new(config);
295        checker.check_health(self)
296    }
297
298    /// Get a simple health status (useful for load balancer health checks).
299    pub fn is_healthy(&self) -> bool {
300        matches!(self.health_check().status, HealthStatus::Healthy)
301    }
302}
303
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        core::RoleSystem, permission::Permission, resource::Resource, role::Role, subject::Subject,
    };

    /// Build a minimal component result with the given status.
    fn component(name: &str, status: HealthStatus) -> ComponentHealth {
        ComponentHealth {
            name: name.to_string(),
            status,
            message: None,
            last_check: String::new(),
            response_time_ms: None,
        }
    }

    #[test]
    fn test_health_check_healthy_system() {
        let mut system = RoleSystem::new();

        // Add some data to make it more realistic
        let role = Role::new("test_role").add_permission(Permission::new("read", "documents"));
        system.register_role(role).unwrap();

        let user = Subject::user("test_user");
        system.assign_role(&user, "test_role").unwrap();

        let health = system.health_check();

        assert_eq!(health.status, HealthStatus::Healthy);
        assert_eq!(health.version, env!("CARGO_PKG_VERSION"));
        assert_eq!(health.components.len(), 3); // storage, cache, metrics

        // A healthy overall status implies every component is healthy; name
        // the offending component if that invariant is ever broken.
        for component in &health.components {
            assert_eq!(
                component.status,
                HealthStatus::Healthy,
                "component `{}` is not healthy",
                component.name
            );
        }
    }

    #[test]
    fn test_health_metrics() {
        let mut system = RoleSystem::new();

        // Set up system
        let role = Role::new("test_role").add_permission(Permission::new("read", "documents"));
        system.register_role(role).unwrap();

        let user = Subject::user("test_user");
        system.assign_role(&user, "test_role").unwrap();

        // Perform some operations to generate metrics
        let resource = Resource::new("doc1", "documents");
        let _ = system.check_permission(&user, "read", &resource);
        let _ = system.check_permission(&user, "write", &resource);

        let health = system.health_check();

        assert!(health.metrics_summary.total_permission_checks >= 2);
        assert_eq!(health.metrics_summary.total_roles, 1);
        assert_eq!(health.metrics_summary.active_subjects, 1);
    }

    #[test]
    fn test_is_healthy() {
        let system = RoleSystem::new();
        assert!(system.is_healthy());
    }

    #[test]
    fn test_health_check_config() {
        let config = HealthCheckConfig {
            storage_timeout: Duration::from_millis(200),
            cache_timeout: Duration::from_millis(100),
            metrics_timeout: Duration::from_millis(50),
        };

        let system = RoleSystem::new();
        let health = system.health_check_with_config(config);

        assert_eq!(health.status, HealthStatus::Healthy);
    }

    #[test]
    fn test_overall_status_uses_worst_component() {
        let checker = HealthChecker::default();

        // No components at all counts as healthy.
        assert_eq!(checker.determine_overall_status(&[]), HealthStatus::Healthy);

        let all_healthy = vec![
            component("storage", HealthStatus::Healthy),
            component("cache", HealthStatus::Healthy),
        ];
        assert_eq!(
            checker.determine_overall_status(&all_healthy),
            HealthStatus::Healthy
        );

        let one_degraded = vec![
            component("storage", HealthStatus::Healthy),
            component("cache", HealthStatus::Degraded),
        ];
        assert_eq!(
            checker.determine_overall_status(&one_degraded),
            HealthStatus::Degraded
        );

        // Unhealthy takes precedence over degraded regardless of order.
        let mixed = vec![
            component("cache", HealthStatus::Degraded),
            component("storage", HealthStatus::Unhealthy),
            component("metrics", HealthStatus::Healthy),
        ];
        assert_eq!(
            checker.determine_overall_status(&mixed),
            HealthStatus::Unhealthy
        );
    }
}