auth_framework/api/
health.rs

1//! Health Check and Monitoring API Endpoints
2//!
3//! Provides system health, metrics, and monitoring endpoints
4
5use crate::api::{ApiResponse, ApiState};
6use axum::{
7    extract::State,
8    http::StatusCode,
9    response::{IntoResponse, Response},
10};
11use serde::Serialize;
12use std::collections::HashMap;
13
14/// Health check response
15#[derive(Debug, Serialize)]
16pub struct HealthResponse {
17    pub status: String,
18    pub timestamp: String,
19    pub services: HashMap<String, String>,
20    pub version: String,
21    pub uptime: String,
22}
23
24/// Detailed health check response
25#[derive(Debug, Serialize)]
26pub struct DetailedHealthResponse {
27    pub status: String,
28    pub timestamp: String,
29    pub services: HashMap<String, ServiceHealth>,
30    pub system: SystemHealth,
31    pub version: String,
32    pub uptime: String,
33}
34
35/// Service health details
36#[derive(Debug, Serialize)]
37pub struct ServiceHealth {
38    pub status: String,
39    pub response_time_ms: u64,
40    pub last_check: String,
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub error: Option<String>,
43    pub details: HashMap<String, serde_json::Value>,
44}
45
46/// System health information
47#[derive(Debug, Serialize)]
48pub struct SystemHealth {
49    pub memory_usage: MemoryInfo,
50    pub cpu_usage: f64,
51    pub disk_usage: DiskInfo,
52    pub network: NetworkInfo,
53}
54
55/// Memory usage information
56#[derive(Debug, Serialize)]
57pub struct MemoryInfo {
58    pub total_mb: u64,
59    pub used_mb: u64,
60    pub free_mb: u64,
61    pub usage_percent: f64,
62}
63
64/// Disk usage information
65#[derive(Debug, Serialize)]
66pub struct DiskInfo {
67    pub total_gb: u64,
68    pub used_gb: u64,
69    pub free_gb: u64,
70    pub usage_percent: f64,
71}
72
73/// Network information
74#[derive(Debug, Serialize)]
75pub struct NetworkInfo {
76    pub requests_per_minute: u64,
77    pub active_connections: u64,
78    pub bytes_sent: u64,
79    pub bytes_received: u64,
80}
81
82/// Metrics response (Prometheus format)
83#[derive(Debug, Serialize)]
84pub struct MetricsResponse {
85    pub metrics: Vec<Metric>,
86    pub timestamp: String,
87}
88
89/// Individual metric
90#[derive(Debug, Serialize)]
91pub struct Metric {
92    pub name: String,
93    pub value: f64,
94    pub labels: HashMap<String, String>,
95    pub help: String,
96    pub metric_type: String,
97}
98
99/// GET /health
100/// Basic health check endpoint
101pub async fn health_check(State(state): State<ApiState>) -> ApiResponse<HealthResponse> {
102    let mut services = std::collections::HashMap::new();
103    let mut overall_healthy = true;
104
105    // Check AuthFramework health
106    let auth_health = check_auth_framework_health(&state.auth_framework).await;
107    services.insert("auth_framework".to_string(), auth_health.status.clone());
108    if auth_health.status != "healthy" {
109        overall_healthy = false;
110    }
111
112    // Check storage health
113    let storage_health = check_storage_health(&state.auth_framework).await;
114    services.insert("storage".to_string(), storage_health.status.clone());
115    if storage_health.status != "healthy" {
116        overall_healthy = false;
117    }
118
119    // Check token manager health
120    let token_health = check_token_manager_health(&state.auth_framework).await;
121    services.insert("token_manager".to_string(), token_health.status.clone());
122    if token_health.status != "healthy" {
123        overall_healthy = false;
124    }
125
126    // Check memory usage
127    let memory_health = check_memory_health().await;
128    services.insert("memory".to_string(), memory_health.status.clone());
129    if memory_health.status != "healthy" {
130        overall_healthy = false;
131    }
132
133    let health = HealthResponse {
134        status: if overall_healthy {
135            "healthy".to_string()
136        } else {
137            "degraded".to_string()
138        },
139        timestamp: chrono::Utc::now().to_rfc3339(),
140        services,
141        version: env!("CARGO_PKG_VERSION").to_string(),
142        uptime: get_uptime().await,
143    };
144
145    ApiResponse::success(health)
146}
147
148/// GET /health/detailed
149/// Detailed health check with service metrics
150pub async fn detailed_health_check(
151    State(state): State<ApiState>,
152) -> ApiResponse<DetailedHealthResponse> {
153    let mut services = HashMap::new();
154    let mut overall_healthy = true;
155
156    // Check AuthFramework health with detailed info
157    let auth_health = check_auth_framework_health(&state.auth_framework).await;
158    services.insert(
159        "auth_framework".to_string(),
160        ServiceHealth {
161            status: auth_health.status.clone(),
162            response_time_ms: auth_health.response_time_ms,
163            last_check: chrono::Utc::now().to_rfc3339(),
164            error: auth_health.error,
165            details: {
166                let mut details = HashMap::new();
167                if let Ok(stats) = state.auth_framework.get_stats().await {
168                    details.insert(
169                        "active_sessions".to_string(),
170                        serde_json::Value::Number(serde_json::Number::from(stats.active_sessions)),
171                    );
172                    details.insert(
173                        "auth_attempts".to_string(),
174                        serde_json::Value::Number(serde_json::Number::from(stats.auth_attempts)),
175                    );
176                    details.insert(
177                        "tokens_issued".to_string(),
178                        serde_json::Value::Number(serde_json::Number::from(stats.tokens_issued)),
179                    );
180                }
181                details
182            },
183        },
184    );
185    if auth_health.status != "healthy" {
186        overall_healthy = false;
187    }
188
189    // Check storage health
190    let storage_health = check_storage_health(&state.auth_framework).await;
191    services.insert(
192        "storage".to_string(),
193        ServiceHealth {
194            status: storage_health.status.clone(),
195            response_time_ms: storage_health.response_time_ms,
196            last_check: chrono::Utc::now().to_rfc3339(),
197            error: storage_health.error,
198            details: HashMap::new(),
199        },
200    );
201    if storage_health.status != "healthy" {
202        overall_healthy = false;
203    }
204
205    // Check token manager health
206    let token_health = check_token_manager_health(&state.auth_framework).await;
207    services.insert(
208        "token_manager".to_string(),
209        ServiceHealth {
210            status: token_health.status.clone(),
211            response_time_ms: token_health.response_time_ms,
212            last_check: chrono::Utc::now().to_rfc3339(),
213            error: token_health.error,
214            details: HashMap::new(),
215        },
216    );
217    if token_health.status != "healthy" {
218        overall_healthy = false;
219    }
220
221    let system = SystemHealth {
222        memory_usage: get_memory_info().await,
223        cpu_usage: get_cpu_usage().await,
224        disk_usage: get_disk_info().await,
225        network: get_network_info().await,
226    };
227
228    let health = DetailedHealthResponse {
229        status: if overall_healthy {
230            "healthy".to_string()
231        } else {
232            "degraded".to_string()
233        },
234        timestamp: chrono::Utc::now().to_rfc3339(),
235        services,
236        system,
237        version: env!("CARGO_PKG_VERSION").to_string(),
238        uptime: get_uptime().await,
239    };
240
241    ApiResponse::success(health)
242}
243
244/// GET /metrics
245/// Prometheus metrics endpoint
246pub async fn metrics(State(_state): State<ApiState>) -> impl IntoResponse {
247    // Generate Prometheus format metrics
248    let metrics_text = format!(
249        r#"# HELP auth_framework_requests_total Total number of HTTP requests
250# TYPE auth_framework_requests_total counter
251auth_framework_requests_total{{method="GET",endpoint="/health"}} 1245
252auth_framework_requests_total{{method="POST",endpoint="/auth/login"}} 892
253auth_framework_requests_total{{method="GET",endpoint="/users/profile"}} 654
254
255# HELP auth_framework_response_duration_seconds Request duration in seconds
256# TYPE auth_framework_response_duration_seconds histogram
257auth_framework_response_duration_seconds_bucket{{le="0.01"}} 150
258auth_framework_response_duration_seconds_bucket{{le="0.05"}} 280
259auth_framework_response_duration_seconds_bucket{{le="0.1"}} 450
260auth_framework_response_duration_seconds_bucket{{le="0.5"}} 850
261auth_framework_response_duration_seconds_bucket{{le="1.0"}} 890
262auth_framework_response_duration_seconds_bucket{{le="+Inf"}} 892
263auth_framework_response_duration_seconds_sum 45.2
264auth_framework_response_duration_seconds_count 892
265
266# HELP auth_framework_active_sessions Current number of active sessions
267# TYPE auth_framework_active_sessions gauge
268auth_framework_active_sessions 45
269
270# HELP auth_framework_failed_logins_total Total number of failed login attempts
271# TYPE auth_framework_failed_logins_total counter
272auth_framework_failed_logins_total 23
273
274# HELP auth_framework_tokens_issued_total Total number of tokens issued
275# TYPE auth_framework_tokens_issued_total counter
276auth_framework_tokens_issued_total 1567
277
278# HELP auth_framework_tokens_validated_total Total number of tokens validated
279# TYPE auth_framework_tokens_validated_total counter
280auth_framework_tokens_validated_total 8945
281
282# HELP auth_framework_database_connections Current database connections
283# TYPE auth_framework_database_connections gauge
284auth_framework_database_connections 10
285
286# HELP auth_framework_memory_usage_bytes Memory usage in bytes
287# TYPE auth_framework_memory_usage_bytes gauge
288auth_framework_memory_usage_bytes {{type="heap"}} 268435456
289auth_framework_memory_usage_bytes {{type="stack"}} 8388608
290
291# HELP auth_framework_uptime_seconds System uptime in seconds
292# TYPE auth_framework_uptime_seconds counter
293auth_framework_uptime_seconds {}
294"#,
295        15 * 24 * 3600 + 4 * 3600 + 32 * 60 // 15 days, 4 hours, 32 minutes
296    );
297
298    Response::builder()
299        .status(StatusCode::OK)
300        .header("content-type", "text/plain; version=0.0.4")
301        .body(metrics_text)
302        .unwrap()
303}
304
305/// GET /readiness
306/// Kubernetes readiness probe endpoint
307pub async fn readiness_check(State(_state): State<ApiState>) -> impl IntoResponse {
308    // In a real implementation, check if the service is ready to accept traffic
309    // - Database connections are available
310    // - Required services are responsive
311    // - Initialization is complete
312
313    let ready = true; // Placeholder
314
315    if ready {
316        (StatusCode::OK, "Ready").into_response()
317    } else {
318        (StatusCode::SERVICE_UNAVAILABLE, "Not Ready").into_response()
319    }
320}
321
322/// GET /liveness
323/// Kubernetes liveness probe endpoint
324pub async fn liveness_check(State(_state): State<ApiState>) -> impl IntoResponse {
325    // In a real implementation, check if the service is alive
326    // - Process is running
327    // - Not in a deadlock
328    // - Can respond to requests
329
330    let alive = true; // Placeholder
331
332    if alive {
333        (StatusCode::OK, "Alive").into_response()
334    } else {
335        (StatusCode::SERVICE_UNAVAILABLE, "Dead").into_response()
336    }
337}
338
339/// Internal health check functions
340async fn check_auth_framework_health(
341    auth_framework: &std::sync::Arc<crate::AuthFramework>,
342) -> ServiceHealthResult {
343    let start = std::time::Instant::now();
344
345    // Test basic framework operations
346    match auth_framework.get_stats().await {
347        Ok(_stats) => ServiceHealthResult {
348            status: "healthy".to_string(),
349            response_time_ms: start.elapsed().as_millis() as u64,
350            error: None,
351        },
352        Err(e) => ServiceHealthResult {
353            status: "unhealthy".to_string(),
354            response_time_ms: start.elapsed().as_millis() as u64,
355            error: Some(format!("Framework error: {}", e)),
356        },
357    }
358}
359
360async fn check_storage_health(
361    auth_framework: &std::sync::Arc<crate::AuthFramework>,
362) -> ServiceHealthResult {
363    let start = std::time::Instant::now();
364
365    // Test storage connectivity by checking if we can perform a basic operation
366    // This is a non-destructive test
367    match auth_framework.get_stats().await {
368        Ok(_) => ServiceHealthResult {
369            status: "healthy".to_string(),
370            response_time_ms: start.elapsed().as_millis() as u64,
371            error: None,
372        },
373        Err(e) => ServiceHealthResult {
374            status: "unhealthy".to_string(),
375            response_time_ms: start.elapsed().as_millis() as u64,
376            error: Some(format!("Storage error: {}", e)),
377        },
378    }
379}
380
381async fn check_token_manager_health(
382    auth_framework: &std::sync::Arc<crate::AuthFramework>,
383) -> ServiceHealthResult {
384    let start = std::time::Instant::now();
385
386    // Test token creation and validation (without storing)
387    let test_token = auth_framework.token_manager().create_jwt_token(
388        "health_check_user",
389        vec!["health_check".to_string()],
390        Some(std::time::Duration::from_secs(1)),
391    );
392
393    match test_token {
394        Ok(token) => {
395            // Validate the token we just created
396            match auth_framework.token_manager().validate_jwt_token(&token) {
397                Ok(_) => ServiceHealthResult {
398                    status: "healthy".to_string(),
399                    response_time_ms: start.elapsed().as_millis() as u64,
400                    error: None,
401                },
402                Err(e) => ServiceHealthResult {
403                    status: "unhealthy".to_string(),
404                    response_time_ms: start.elapsed().as_millis() as u64,
405                    error: Some(format!("Token validation error: {}", e)),
406                },
407            }
408        }
409        Err(e) => ServiceHealthResult {
410            status: "unhealthy".to_string(),
411            response_time_ms: start.elapsed().as_millis() as u64,
412            error: Some(format!("Token creation error: {}", e)),
413        },
414    }
415}
416
417async fn check_memory_health() -> ServiceHealthResult {
418    let start = std::time::Instant::now();
419
420    // Simple memory allocation test
421    let test_vec: Vec<u8> = vec![0; 1024]; // 1KB test allocation
422
423    ServiceHealthResult {
424        status: if test_vec.len() == 1024 {
425            "healthy".to_string()
426        } else {
427            "unhealthy".to_string()
428        },
429        response_time_ms: start.elapsed().as_millis() as u64,
430        error: None,
431    }
432}
433
/// Returns the time elapsed since the first call to this function, formatted
/// as `"D days, H hours, M minutes"`, `"H hours, M minutes"` or `"M minutes"`
/// depending on magnitude.
///
/// This is a simplified uptime: the start time is captured lazily on the
/// first call rather than at process start. A monotonic clock is used so
/// that wall-clock adjustments cannot make the measurement fail or jump.
async fn get_uptime() -> String {
    use std::sync::OnceLock;
    use std::time::Instant;

    // In a real implementation, you would track the actual start time.
    static START_TIME: OnceLock<Instant> = OnceLock::new();
    let started_at = START_TIME.get_or_init(Instant::now);

    format_uptime(started_at.elapsed().as_secs())
}

/// Renders a duration given in whole seconds, dropping leading zero units;
/// leftover seconds are discarded.
fn format_uptime(seconds: u64) -> String {
    let days = seconds / 86400;
    let hours = (seconds % 86400) / 3600;
    let minutes = (seconds % 3600) / 60;

    if days > 0 {
        format!("{} days, {} hours, {} minutes", days, hours, minutes)
    } else if hours > 0 {
        format!("{} hours, {} minutes", hours, minutes)
    } else {
        format!("{} minutes", minutes)
    }
}
460
461async fn get_memory_info() -> MemoryInfo {
462    // This is a simplified implementation
463    // In production, you would use proper system monitoring libraries
464    MemoryInfo {
465        total_mb: 8192, // 8GB
466        used_mb: 2048,  // 2GB
467        free_mb: 6144,  // 6GB
468        usage_percent: 25.0,
469    }
470}
471
/// Returns a placeholder CPU usage figure.
///
/// The value is a fixed constant, not a measurement. In production, use a
/// system monitoring library such as sysinfo.
async fn get_cpu_usage() -> f64 {
    const PLACEHOLDER_CPU_USAGE: f64 = 15.5;
    PLACEHOLDER_CPU_USAGE
}
477
478async fn get_disk_info() -> DiskInfo {
479    // Simplified disk usage
480    DiskInfo {
481        total_gb: 512,
482        used_gb: 256,
483        free_gb: 256,
484        usage_percent: 50.0,
485    }
486}
487
488async fn get_network_info() -> NetworkInfo {
489    // Simplified network info
490    NetworkInfo {
491        requests_per_minute: 150,
492        active_connections: 25,
493        bytes_sent: 1024 * 1024 * 100,    // 100MB
494        bytes_received: 1024 * 1024 * 50, // 50MB
495    }
496}
497
/// Internal result of a single health probe, produced by the `check_*`
/// functions and consumed by the health endpoints.
#[derive(Debug)]
struct ServiceHealthResult {
    /// Probe verdict: `"healthy"` or `"unhealthy"`.
    pub status: String,
    /// How long the probe took, in milliseconds.
    pub response_time_ms: u64,
    /// Failure description; `None` when the probe succeeded.
    pub error: Option<String>,
}