auth_framework/api/
health.rs

1//! Health Check and Monitoring API Endpoints
2//!
3//! Provides system health, metrics, and monitoring endpoints
4
5use crate::api::{ApiResponse, ApiState};
6use axum::{
7    Json,
8    extract::State,
9    http::StatusCode,
10    response::{IntoResponse, Response},
11};
12use serde::Serialize;
13use std::collections::HashMap;
14
/// Health check response
///
/// Payload built by `health_check` for the basic `GET /health` endpoint.
#[derive(Debug, Serialize)]
pub struct HealthResponse {
    /// Overall status: `"healthy"` when every probed service reported
    /// `"healthy"`, otherwise `"degraded"`.
    pub status: String,
    /// RFC 3339 timestamp (UTC) taken when the response was assembled.
    pub timestamp: String,
    /// Status string of each probed service, keyed by service name.
    pub services: HashMap<String, String>,
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Human-readable uptime text as produced by `get_uptime`.
    pub uptime: String,
}
24
/// Detailed health check response
///
/// Payload built by `detailed_health_check` for `GET /health/detailed`.
#[derive(Debug, Serialize)]
pub struct DetailedHealthResponse {
    /// Overall status: `"healthy"` when every probed service reported
    /// `"healthy"`, otherwise `"degraded"`.
    pub status: String,
    /// RFC 3339 timestamp (UTC) taken when the response was assembled.
    pub timestamp: String,
    /// Per-service health report, keyed by service name.
    pub services: HashMap<String, ServiceHealth>,
    /// Host-level resource figures (memory, CPU, disk, network).
    pub system: SystemHealth,
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Human-readable uptime text as produced by `get_uptime`.
    pub uptime: String,
}
35
/// Service health details
///
/// One entry of `DetailedHealthResponse::services`.
#[derive(Debug, Serialize)]
pub struct ServiceHealth {
    /// Probe outcome; the probes in this file report `"healthy"` or `"unhealthy"`.
    pub status: String,
    /// Time the probe took, in milliseconds.
    pub response_time_ms: u64,
    /// RFC 3339 timestamp (UTC) recorded when the report was built.
    pub last_check: String,
    /// Failure description; left out of the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Free-form extra values for the service (may be empty).
    pub details: HashMap<String, serde_json::Value>,
}
46
/// System health information
///
/// Aggregates the resource figures returned by `get_memory_info`,
/// `get_cpu_usage`, `get_disk_info` and `get_network_info`.
#[derive(Debug, Serialize)]
pub struct SystemHealth {
    /// Memory totals and usage.
    pub memory_usage: MemoryInfo,
    /// CPU usage figure (presumably a percentage — confirm with the producer).
    pub cpu_usage: f64,
    /// Disk totals and usage.
    pub disk_usage: DiskInfo,
    /// Request and traffic counters.
    pub network: NetworkInfo,
}
55
/// Memory usage information
///
/// Field names carry the unit: sizes are in megabytes.
#[derive(Debug, Serialize)]
pub struct MemoryInfo {
    /// Total memory, in MB.
    pub total_mb: u64,
    /// Memory in use, in MB.
    pub used_mb: u64,
    /// Memory available, in MB.
    pub free_mb: u64,
    /// Share of memory in use, as a percentage (e.g. `25.0`).
    pub usage_percent: f64,
}
64
/// Disk usage information
///
/// Field names carry the unit: sizes are in gigabytes.
#[derive(Debug, Serialize)]
pub struct DiskInfo {
    /// Total disk capacity, in GB.
    pub total_gb: u64,
    /// Disk space in use, in GB.
    pub used_gb: u64,
    /// Disk space available, in GB.
    pub free_gb: u64,
    /// Share of disk space in use, as a percentage (e.g. `50.0`).
    pub usage_percent: f64,
}
73
/// Network information
///
/// Request-rate and traffic counters reported under `SystemHealth::network`.
#[derive(Debug, Serialize)]
pub struct NetworkInfo {
    /// Requests handled per minute.
    pub requests_per_minute: u64,
    /// Number of currently active connections.
    pub active_connections: u64,
    /// Bytes sent.
    pub bytes_sent: u64,
    /// Bytes received.
    pub bytes_received: u64,
}
82
/// Metrics response (Prometheus format)
///
/// Serializable collection of metrics.
// NOTE(review): the `metrics` handler in this file returns hand-written plain
// text and does not build this type — confirm where it is used.
#[derive(Debug, Serialize)]
pub struct MetricsResponse {
    /// The individual metric samples.
    pub metrics: Vec<Metric>,
    /// Timestamp of the snapshot (presumably RFC 3339 like the other
    /// responses in this module — confirm with the producer).
    pub timestamp: String,
}
89
/// Individual metric
///
/// One sample inside `MetricsResponse::metrics`.
#[derive(Debug, Serialize)]
pub struct Metric {
    /// Metric name.
    pub name: String,
    /// Sample value.
    pub value: f64,
    /// Label name/value pairs attached to the sample.
    pub labels: HashMap<String, String>,
    /// Human-readable description of the metric.
    pub help: String,
    /// Metric type (presumably a Prometheus type such as `counter` or
    /// `gauge` — confirm with the producer).
    pub metric_type: String,
}
99
100/// GET /health
101/// Basic health check endpoint
102pub async fn health_check(State(state): State<ApiState>) -> ApiResponse<HealthResponse> {
103    let mut services = std::collections::HashMap::new();
104    let mut overall_healthy = true;
105
106    // Check AuthFramework health
107    let auth_health = check_auth_framework_health(&state.auth_framework).await;
108    services.insert("auth_framework".to_string(), auth_health.status.clone());
109    if auth_health.status != "healthy" {
110        overall_healthy = false;
111    }
112
113    // Check storage health
114    let storage_health = check_storage_health(&state.auth_framework).await;
115    services.insert("storage".to_string(), storage_health.status.clone());
116    if storage_health.status != "healthy" {
117        overall_healthy = false;
118    }
119
120    // Check token manager health
121    let token_health = check_token_manager_health(&state.auth_framework).await;
122    services.insert("token_manager".to_string(), token_health.status.clone());
123    if token_health.status != "healthy" {
124        overall_healthy = false;
125    }
126
127    // Check memory usage
128    let memory_health = check_memory_health().await;
129    services.insert("memory".to_string(), memory_health.status.clone());
130    if memory_health.status != "healthy" {
131        overall_healthy = false;
132    }
133
134    let health = HealthResponse {
135        status: if overall_healthy {
136            "healthy".to_string()
137        } else {
138            "degraded".to_string()
139        },
140        timestamp: chrono::Utc::now().to_rfc3339(),
141        services,
142        version: env!("CARGO_PKG_VERSION").to_string(),
143        uptime: get_uptime().await,
144    };
145
146    ApiResponse::success(health)
147}
148
149/// GET /health/detailed
150/// Detailed health check with service metrics
151pub async fn detailed_health_check(
152    State(state): State<ApiState>,
153) -> ApiResponse<DetailedHealthResponse> {
154    let mut services = HashMap::new();
155    let mut overall_healthy = true;
156
157    // Check AuthFramework health with detailed info
158    let auth_health = check_auth_framework_health(&state.auth_framework).await;
159    services.insert(
160        "auth_framework".to_string(),
161        ServiceHealth {
162            status: auth_health.status.clone(),
163            response_time_ms: auth_health.response_time_ms,
164            last_check: chrono::Utc::now().to_rfc3339(),
165            error: auth_health.error,
166            details: {
167                let mut details = HashMap::new();
168                if let Ok(stats) = state.auth_framework.get_stats().await {
169                    details.insert(
170                        "active_sessions".to_string(),
171                        serde_json::Value::Number(serde_json::Number::from(stats.active_sessions)),
172                    );
173                    details.insert(
174                        "auth_attempts".to_string(),
175                        serde_json::Value::Number(serde_json::Number::from(stats.auth_attempts)),
176                    );
177                    details.insert(
178                        "tokens_issued".to_string(),
179                        serde_json::Value::Number(serde_json::Number::from(stats.tokens_issued)),
180                    );
181                }
182                details
183            },
184        },
185    );
186    if auth_health.status != "healthy" {
187        overall_healthy = false;
188    }
189
190    // Check storage health
191    let storage_health = check_storage_health(&state.auth_framework).await;
192    services.insert(
193        "storage".to_string(),
194        ServiceHealth {
195            status: storage_health.status.clone(),
196            response_time_ms: storage_health.response_time_ms,
197            last_check: chrono::Utc::now().to_rfc3339(),
198            error: storage_health.error,
199            details: HashMap::new(),
200        },
201    );
202    if storage_health.status != "healthy" {
203        overall_healthy = false;
204    }
205
206    // Check token manager health
207    let token_health = check_token_manager_health(&state.auth_framework).await;
208    services.insert(
209        "token_manager".to_string(),
210        ServiceHealth {
211            status: token_health.status.clone(),
212            response_time_ms: token_health.response_time_ms,
213            last_check: chrono::Utc::now().to_rfc3339(),
214            error: token_health.error,
215            details: HashMap::new(),
216        },
217    );
218    if token_health.status != "healthy" {
219        overall_healthy = false;
220    }
221
222    let system = SystemHealth {
223        memory_usage: get_memory_info().await,
224        cpu_usage: get_cpu_usage().await,
225        disk_usage: get_disk_info().await,
226        network: get_network_info().await,
227    };
228
229    let health = DetailedHealthResponse {
230        status: if overall_healthy {
231            "healthy".to_string()
232        } else {
233            "degraded".to_string()
234        },
235        timestamp: chrono::Utc::now().to_rfc3339(),
236        services,
237        system,
238        version: env!("CARGO_PKG_VERSION").to_string(),
239        uptime: get_uptime().await,
240    };
241
242    ApiResponse::success(health)
243}
244
245/// GET /metrics
246/// Prometheus metrics endpoint (returns plain text for Prometheus compatibility)
247pub async fn metrics(State(_state): State<ApiState>) -> impl IntoResponse {
248    // Generate Prometheus format metrics
249    let metrics_text = format!(
250        r#"# HELP auth_framework_requests_total Total number of HTTP requests
251# TYPE auth_framework_requests_total counter
252auth_framework_requests_total{{method="GET",endpoint="/health"}} 1245
253auth_framework_requests_total{{method="POST",endpoint="/api/v1/auth/login"}} 892
254auth_framework_requests_total{{method="GET",endpoint="/api/v1/users/profile"}} 654
255
256# HELP auth_framework_response_duration_seconds Request duration in seconds
257# TYPE auth_framework_response_duration_seconds histogram
258auth_framework_response_duration_seconds_bucket{{le="0.01"}} 150
259auth_framework_response_duration_seconds_bucket{{le="0.05"}} 280
260auth_framework_response_duration_seconds_bucket{{le="0.1"}} 450
261auth_framework_response_duration_seconds_bucket{{le="0.5"}} 850
262auth_framework_response_duration_seconds_bucket{{le="1.0"}} 890
263auth_framework_response_duration_seconds_bucket{{le="+Inf"}} 892
264auth_framework_response_duration_seconds_sum 45.2
265auth_framework_response_duration_seconds_count 892
266
267# HELP auth_framework_active_sessions Current number of active sessions
268# TYPE auth_framework_active_sessions gauge
269auth_framework_active_sessions 45
270
271# HELP auth_framework_failed_logins_total Total number of failed login attempts
272# TYPE auth_framework_failed_logins_total counter
273auth_framework_failed_logins_total 23
274
275# HELP auth_framework_tokens_issued_total Total number of tokens issued
276# TYPE auth_framework_tokens_issued_total counter
277auth_framework_tokens_issued_total 1567
278
279# HELP auth_framework_tokens_validated_total Total number of tokens validated
280# TYPE auth_framework_tokens_validated_total counter
281auth_framework_tokens_validated_total 8945
282
283# HELP auth_framework_database_connections Current database connections
284# TYPE auth_framework_database_connections gauge
285auth_framework_database_connections 10
286
287# HELP auth_framework_memory_usage_bytes Memory usage in bytes
288# TYPE auth_framework_memory_usage_bytes gauge
289auth_framework_memory_usage_bytes {{type="heap"}} 268435456
290auth_framework_memory_usage_bytes {{type="stack"}} 8388608
291
292# HELP auth_framework_uptime_seconds System uptime in seconds
293# TYPE auth_framework_uptime_seconds counter
294auth_framework_uptime_seconds {}
295"#,
296        15 * 24 * 3600 + 4 * 3600 + 32 * 60 // 15 days, 4 hours, 32 minutes
297    );
298
299    Response::builder()
300        .status(StatusCode::OK)
301        .header("content-type", "text/plain; version=0.0.4")
302        .body(metrics_text)
303        .unwrap()
304}
305
/// Readiness probe response
///
/// Payload returned by `readiness_check`.
#[derive(Debug, Serialize)]
pub struct ReadinessResponse {
    /// Whether the service reports itself ready.
    pub ready: bool,
    /// Human-readable text matching `ready`.
    pub message: String,
}
312
313/// GET /readiness
314/// Kubernetes readiness probe endpoint
315pub async fn readiness_check(
316    State(_state): State<ApiState>,
317) -> Json<ApiResponse<ReadinessResponse>> {
318    // In a real implementation, check if the service is ready to accept traffic
319    // - Database connections are available
320    // - Required services are responsive
321    // - Initialization is complete
322
323    let ready = true; // Placeholder
324
325    let response = ReadinessResponse {
326        ready,
327        message: if ready {
328            "Service is ready".to_string()
329        } else {
330            "Service not ready".to_string()
331        },
332    };
333
334    Json(ApiResponse::success(response))
335}
336
/// Liveness probe response
///
/// Payload returned by `liveness_check`.
#[derive(Debug, Serialize)]
pub struct LivenessResponse {
    /// Whether the service reports itself alive.
    pub alive: bool,
    /// Human-readable text matching `alive`.
    pub message: String,
}
343
344/// GET /liveness
345/// Kubernetes liveness probe endpoint
346pub async fn liveness_check(State(_state): State<ApiState>) -> Json<ApiResponse<LivenessResponse>> {
347    // In a real implementation, check if the service is alive
348    // - Process is running
349    // - Not in a deadlock
350    // - Can respond to requests
351
352    let alive = true; // Placeholder
353
354    let response = LivenessResponse {
355        alive,
356        message: if alive {
357            "Service is alive".to_string()
358        } else {
359            "Service is dead".to_string()
360        },
361    };
362
363    Json(ApiResponse::success(response))
364}
365
366/// Internal health check functions
367async fn check_auth_framework_health(
368    auth_framework: &std::sync::Arc<crate::AuthFramework>,
369) -> ServiceHealthResult {
370    let start = std::time::Instant::now();
371
372    // Test basic framework operations
373    match auth_framework.get_stats().await {
374        Ok(_stats) => ServiceHealthResult {
375            status: "healthy".to_string(),
376            response_time_ms: start.elapsed().as_millis() as u64,
377            error: None,
378        },
379        Err(e) => ServiceHealthResult {
380            status: "unhealthy".to_string(),
381            response_time_ms: start.elapsed().as_millis() as u64,
382            error: Some(format!("Framework error: {}", e)),
383        },
384    }
385}
386
387async fn check_storage_health(
388    auth_framework: &std::sync::Arc<crate::AuthFramework>,
389) -> ServiceHealthResult {
390    let start = std::time::Instant::now();
391
392    // Test storage connectivity by checking if we can perform a basic operation
393    // This is a non-destructive test
394    match auth_framework.get_stats().await {
395        Ok(_) => ServiceHealthResult {
396            status: "healthy".to_string(),
397            response_time_ms: start.elapsed().as_millis() as u64,
398            error: None,
399        },
400        Err(e) => ServiceHealthResult {
401            status: "unhealthy".to_string(),
402            response_time_ms: start.elapsed().as_millis() as u64,
403            error: Some(format!("Storage error: {}", e)),
404        },
405    }
406}
407
408async fn check_token_manager_health(
409    auth_framework: &std::sync::Arc<crate::AuthFramework>,
410) -> ServiceHealthResult {
411    let start = std::time::Instant::now();
412
413    // Test token creation and validation (without storing)
414    let test_token = auth_framework.token_manager().create_jwt_token(
415        "health_check_user",
416        vec!["health_check".to_string()],
417        Some(std::time::Duration::from_secs(1)),
418    );
419
420    match test_token {
421        Ok(token) => {
422            // Validate the token we just created
423            match auth_framework.token_manager().validate_jwt_token(&token) {
424                Ok(_) => ServiceHealthResult {
425                    status: "healthy".to_string(),
426                    response_time_ms: start.elapsed().as_millis() as u64,
427                    error: None,
428                },
429                Err(e) => ServiceHealthResult {
430                    status: "unhealthy".to_string(),
431                    response_time_ms: start.elapsed().as_millis() as u64,
432                    error: Some(format!("Token validation error: {}", e)),
433                },
434            }
435        }
436        Err(e) => ServiceHealthResult {
437            status: "unhealthy".to_string(),
438            response_time_ms: start.elapsed().as_millis() as u64,
439            error: Some(format!("Token creation error: {}", e)),
440        },
441    }
442}
443
444async fn check_memory_health() -> ServiceHealthResult {
445    let start = std::time::Instant::now();
446
447    // Simple memory allocation test
448    let test_vec: Vec<u8> = vec![0; 1024]; // 1KB test allocation
449
450    ServiceHealthResult {
451        status: if test_vec.len() == 1024 {
452            "healthy".to_string()
453        } else {
454            "unhealthy".to_string()
455        },
456        response_time_ms: start.elapsed().as_millis() as u64,
457        error: None,
458    }
459}
460
/// Returns the uptime as human-readable text.
///
/// The start point is recorded lazily on the first call, so the reported
/// value is the time since this function was first invoked rather than
/// since process start.
///
/// Output shapes, chosen by the largest non-zero unit:
/// - `"{d} days, {h} hours, {m} minutes"`
/// - `"{h} hours, {m} minutes"`
/// - `"{m} minutes"`
async fn get_uptime() -> String {
    use std::time::Instant;

    // `Instant` is monotonic: unlike a wall-clock start time, the elapsed
    // value cannot fail or jump when the system clock is adjusted.
    static START_TIME: std::sync::OnceLock<Instant> = std::sync::OnceLock::new();
    let seconds = START_TIME.get_or_init(Instant::now).elapsed().as_secs();

    let days = seconds / 86_400;
    let hours = (seconds % 86_400) / 3_600;
    let minutes = (seconds % 3_600) / 60;

    if days > 0 {
        format!("{} days, {} hours, {} minutes", days, hours, minutes)
    } else if hours > 0 {
        format!("{} hours, {} minutes", hours, minutes)
    } else {
        format!("{} minutes", minutes)
    }
}
487
488async fn get_memory_info() -> MemoryInfo {
489    // This is a simplified implementation
490    // In production, you would use proper system monitoring libraries
491    MemoryInfo {
492        total_mb: 8192, // 8GB
493        used_mb: 2048,  // 2GB
494        free_mb: 6144,  // 6GB
495        usage_percent: 25.0,
496    }
497}
498
/// Returns the CPU usage figure.
///
/// Simplified implementation: the value is a fixed constant, not a
/// measurement. Production code would use a system monitoring library
/// such as `sysinfo`.
async fn get_cpu_usage() -> f64 {
    const PLACEHOLDER_CPU_USAGE: f64 = 15.5;
    PLACEHOLDER_CPU_USAGE
}
504
505async fn get_disk_info() -> DiskInfo {
506    // Simplified disk usage
507    DiskInfo {
508        total_gb: 512,
509        used_gb: 256,
510        free_gb: 256,
511        usage_percent: 50.0,
512    }
513}
514
515async fn get_network_info() -> NetworkInfo {
516    // Simplified network info
517    NetworkInfo {
518        requests_per_minute: 150,
519        active_connections: 25,
520        bytes_sent: 1024 * 1024 * 100,    // 100MB
521        bytes_received: 1024 * 1024 * 50, // 50MB
522    }
523}
524
/// Outcome of one internal health probe (the `check_*_health` functions).
///
/// Private to this module; the handlers copy these fields into the public
/// response types.
#[derive(Debug)]
struct ServiceHealthResult {
    /// `"healthy"` or `"unhealthy"`, as set by the probes in this file.
    pub status: String,
    /// Time the probe took, in milliseconds.
    pub response_time_ms: u64,
    /// Failure description when the probe did not succeed.
    pub error: Option<String>,
}
531