Skip to main content

auth_framework/api/
health.rs

1//! Health Check and Monitoring API Endpoints
2//!
3//! Provides system health, metrics, and monitoring endpoints
4
5use crate::api::{ApiResponse, ApiState};
6use axum::{
7    extract::State,
8    http::StatusCode,
9    response::{IntoResponse, Response},
10};
11use serde::Serialize;
12use std::collections::HashMap;
13
/// Basic health check response, as built by [`health_check`].
///
/// Serialized with `serde`; the field names below are the keys of the output.
#[derive(Debug, Serialize)]
pub struct HealthResponse {
    /// Overall status: `"healthy"` when every probed service reported `"healthy"`,
    /// otherwise `"degraded"`.
    pub status: String,
    /// RFC 3339 (ISO-8601 compatible) UTC timestamp taken when the response was built.
    pub timestamp: String,
    /// Per-service status summary, keyed by service name (e.g. `"storage"`).
    pub services: HashMap<String, String>,
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Human-readable server uptime (e.g. `"3 hours, 12 minutes"`).
    pub uptime: String,
}
28
/// Extended health check response including per-service latency and system resource usage.
///
/// Built by [`detailed_health_check`].
#[derive(Debug, Serialize)]
pub struct DetailedHealthResponse {
    /// Overall status: `"healthy"` when every probed service reported `"healthy"`,
    /// otherwise `"degraded"`.
    pub status: String,
    /// RFC 3339 UTC timestamp taken when the response was built.
    pub timestamp: String,
    /// Detailed probe result per service, keyed by service name.
    pub services: HashMap<String, ServiceHealth>,
    /// Host resource usage snapshot (memory, CPU, disk, network).
    pub system: SystemHealth,
    /// Crate version, taken from `CARGO_PKG_VERSION` at compile time.
    pub version: String,
    /// Human-readable server uptime (e.g. `"3 hours, 12 minutes"`).
    pub uptime: String,
}
39
/// Per-service health details.
#[derive(Debug, Serialize)]
pub struct ServiceHealth {
    /// Service status string. The probes in this file emit `"healthy"` or
    /// `"unhealthy"`; `"degraded"` is also a documented value.
    pub status: String,
    /// Time the probe took to complete, in milliseconds.
    pub response_time_ms: u64,
    /// RFC 3339 UTC timestamp recorded for the probe.
    pub last_check: String,
    /// Error description when the probe failed; left out of the serialized
    /// output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Extra service-specific values keyed by name; may be empty.
    pub details: HashMap<String, serde_json::Value>,
}
53
/// Aggregate system resource usage.
#[derive(Debug, Serialize)]
pub struct SystemHealth {
    /// Memory totals and usage.
    pub memory_usage: MemoryInfo,
    /// CPU usage as reported by the `sysinfo` global CPU reading
    /// (presumably a percentage — confirm against the `sysinfo` docs).
    pub cpu_usage: f64,
    /// Disk capacity and usage.
    pub disk_usage: DiskInfo,
    /// Network traffic counters.
    pub network: NetworkInfo,
}
62
/// Memory usage figures.
///
/// In this file the values are filled from the `sysinfo::System` memory
/// counters (total / used / available), converted from bytes to mebibytes.
#[derive(Debug, Serialize)]
pub struct MemoryInfo {
    /// Total memory, in MiB (integer division, rounded down).
    pub total_mb: u64,
    /// Used memory, in MiB (rounded down).
    pub used_mb: u64,
    /// Available memory, in MiB (rounded down).
    pub free_mb: u64,
    /// Used memory as a percentage of total; `0.0` when the total is zero.
    pub usage_percent: f64,
}
71
/// Disk usage statistics.
///
/// In this file the values are summed over every disk `sysinfo` lists.
#[derive(Debug, Serialize)]
pub struct DiskInfo {
    /// Total capacity, in GiB (integer division, rounded down).
    pub total_gb: u64,
    /// Space in use (capacity minus available space), in GiB (rounded down).
    pub used_gb: u64,
    /// Remaining space, in GiB, computed as `total_gb - used_gb`.
    pub free_gb: u64,
    /// Used space as a percentage of total; `0.0` when no capacity is reported.
    pub usage_percent: f64,
}
80
/// Network traffic counters.
#[derive(Debug, Serialize)]
pub struct NetworkInfo {
    /// Requests handled per minute. An application-level metric;
    /// `get_network_info` in this file always reports `0`.
    pub requests_per_minute: u64,
    /// Currently open connections. An application-level metric;
    /// `get_network_info` in this file always reports `0`.
    pub active_connections: u64,
    /// Bytes transmitted, summed over all network interfaces `sysinfo` lists.
    pub bytes_sent: u64,
    /// Bytes received, summed over all network interfaces `sysinfo` lists.
    pub bytes_received: u64,
}
89
/// Container for exported Prometheus-style metrics.
///
/// NOTE(review): no handler in this file constructs this type; the `/metrics`
/// endpoint returns plain text instead.
#[derive(Debug, Serialize)]
pub struct MetricsResponse {
    /// The individual metric samples.
    pub metrics: Vec<Metric>,
    /// Timestamp of the export, as a string (format not fixed by this type).
    pub timestamp: String,
}
96
/// A single labeled metric.
#[derive(Debug, Serialize)]
pub struct Metric {
    /// Metric name.
    pub name: String,
    /// Sample value.
    pub value: f64,
    /// Label name/value pairs attached to the sample.
    pub labels: HashMap<String, String>,
    /// Human-readable description of the metric.
    pub help: String,
    /// Metric type name (presumably a Prometheus type such as `counter` or
    /// `gauge` — not constrained by this struct).
    pub metric_type: String,
}
106
107/// `GET /health` — lightweight health check returning overall status and per-service summary.
108pub async fn health_check(State(state): State<ApiState>) -> ApiResponse<HealthResponse> {
109    let mut services = std::collections::HashMap::new();
110    let mut overall_healthy = true;
111
112    // Check AuthFramework health
113    let auth_health = check_auth_framework_health(&state.auth_framework).await;
114    services.insert("auth_framework".to_string(), auth_health.status.clone());
115    if auth_health.status != "healthy" {
116        overall_healthy = false;
117    }
118
119    // Check storage health
120    let storage_health = check_storage_health(&state.auth_framework).await;
121    services.insert("storage".to_string(), storage_health.status.clone());
122    if storage_health.status != "healthy" {
123        overall_healthy = false;
124    }
125
126    // Check token manager health
127    let token_health = check_token_manager_health(&state.auth_framework).await;
128    services.insert("token_manager".to_string(), token_health.status.clone());
129    if token_health.status != "healthy" {
130        overall_healthy = false;
131    }
132
133    // Check memory usage
134    let memory_health = check_memory_health().await;
135    services.insert("memory".to_string(), memory_health.status.clone());
136    if memory_health.status != "healthy" {
137        overall_healthy = false;
138    }
139
140    let health = HealthResponse {
141        status: if overall_healthy {
142            "healthy".to_string()
143        } else {
144            "degraded".to_string()
145        },
146        timestamp: chrono::Utc::now().to_rfc3339(),
147        services,
148        version: env!("CARGO_PKG_VERSION").to_string(),
149        uptime: get_uptime().await,
150    };
151
152    ApiResponse::success(health)
153}
154
155/// `GET /health/detailed` — extended health check with latency measurements and system resource usage.
156pub async fn detailed_health_check(
157    State(state): State<ApiState>,
158) -> ApiResponse<DetailedHealthResponse> {
159    let mut services = HashMap::new();
160    let mut overall_healthy = true;
161
162    // Check AuthFramework health with detailed info
163    let auth_health = check_auth_framework_health(&state.auth_framework).await;
164    services.insert(
165        "auth_framework".to_string(),
166        ServiceHealth {
167            status: auth_health.status.clone(),
168            response_time_ms: auth_health.response_time_ms,
169            last_check: chrono::Utc::now().to_rfc3339(),
170            error: auth_health.error,
171            details: {
172                let mut details = HashMap::new();
173                if let Ok(stats) = state.auth_framework.get_stats().await {
174                    details.insert(
175                        "active_sessions".to_string(),
176                        serde_json::Value::Number(serde_json::Number::from(stats.active_sessions)),
177                    );
178                    details.insert(
179                        "auth_attempts".to_string(),
180                        serde_json::Value::Number(serde_json::Number::from(stats.auth_attempts)),
181                    );
182                    details.insert(
183                        "tokens_issued".to_string(),
184                        serde_json::Value::Number(serde_json::Number::from(stats.tokens_issued)),
185                    );
186                }
187                details
188            },
189        },
190    );
191    if auth_health.status != "healthy" {
192        overall_healthy = false;
193    }
194
195    // Check storage health
196    let storage_health = check_storage_health(&state.auth_framework).await;
197    services.insert(
198        "storage".to_string(),
199        ServiceHealth {
200            status: storage_health.status.clone(),
201            response_time_ms: storage_health.response_time_ms,
202            last_check: chrono::Utc::now().to_rfc3339(),
203            error: storage_health.error,
204            details: HashMap::new(),
205        },
206    );
207    if storage_health.status != "healthy" {
208        overall_healthy = false;
209    }
210
211    // Check token manager health
212    let token_health = check_token_manager_health(&state.auth_framework).await;
213    services.insert(
214        "token_manager".to_string(),
215        ServiceHealth {
216            status: token_health.status.clone(),
217            response_time_ms: token_health.response_time_ms,
218            last_check: chrono::Utc::now().to_rfc3339(),
219            error: token_health.error,
220            details: HashMap::new(),
221        },
222    );
223    if token_health.status != "healthy" {
224        overall_healthy = false;
225    }
226
227    let system = SystemHealth {
228        memory_usage: get_memory_info().await,
229        cpu_usage: get_cpu_usage().await,
230        disk_usage: get_disk_info().await,
231        network: get_network_info().await,
232    };
233
234    let health = DetailedHealthResponse {
235        status: if overall_healthy {
236            "healthy".to_string()
237        } else {
238            "degraded".to_string()
239        },
240        timestamp: chrono::Utc::now().to_rfc3339(),
241        services,
242        system,
243        version: env!("CARGO_PKG_VERSION").to_string(),
244        uptime: get_uptime().await,
245    };
246
247    ApiResponse::success(health)
248}
249
250/// `GET /metrics` — export metrics in Prometheus text exposition format.
251pub async fn metrics(State(state): State<ApiState>) -> impl IntoResponse {
252    let metrics_text = state.auth_framework.export_prometheus_metrics().await;
253
254    Response::builder()
255        .status(StatusCode::OK)
256        .header("content-type", "text/plain; version=0.0.4")
257        .body(metrics_text)
258        .expect("infallible: String body is always valid")
259}
260
261/// `GET /readiness` — Kubernetes readiness probe (200 when able to serve traffic).
262pub async fn readiness_check(State(state): State<ApiState>) -> impl IntoResponse {
263    // Check if the auth framework is ready to accept traffic by trying to get stats.
264    // A successful stats call confirms storage, token manager, and core services are up.
265    let ready = state.auth_framework.get_stats().await.is_ok();
266
267    if ready {
268        (StatusCode::OK, "Ready").into_response()
269    } else {
270        (StatusCode::SERVICE_UNAVAILABLE, "Not Ready").into_response()
271    }
272}
273
274/// `GET /liveness` — Kubernetes liveness probe (200 if the async runtime is responsive).
275pub async fn liveness_check(State(state): State<ApiState>) -> impl IntoResponse {
276    // Verify the service can perform a basic operation — completing the await on
277    // get_performance_metrics confirms the async runtime is not deadlocked.
278    state.auth_framework.get_performance_metrics().await;
279    (StatusCode::OK, "Alive").into_response()
280}
281
282/// Internal health check functions
283async fn check_auth_framework_health(
284    auth_framework: &std::sync::Arc<crate::AuthFramework>,
285) -> ServiceHealthResult {
286    let start = std::time::Instant::now();
287
288    // Test basic framework operations
289    match auth_framework.get_stats().await {
290        Ok(_stats) => ServiceHealthResult {
291            status: "healthy".to_string(),
292            response_time_ms: start.elapsed().as_millis() as u64,
293            error: None,
294        },
295        Err(e) => {
296            tracing::warn!(error = %e, "Health check: framework error");
297            ServiceHealthResult {
298                status: "unhealthy".to_string(),
299                response_time_ms: start.elapsed().as_millis() as u64,
300                error: Some("Service check failed".to_string()),
301            }
302        }
303    }
304}
305
306async fn check_storage_health(
307    auth_framework: &std::sync::Arc<crate::AuthFramework>,
308) -> ServiceHealthResult {
309    let start = std::time::Instant::now();
310
311    // Test storage connectivity by checking if we can perform a basic operation
312    // This is a non-destructive test
313    match auth_framework.get_stats().await {
314        Ok(_) => ServiceHealthResult {
315            status: "healthy".to_string(),
316            response_time_ms: start.elapsed().as_millis() as u64,
317            error: None,
318        },
319        Err(e) => {
320            tracing::warn!(error = %e, "Health check: storage error");
321            ServiceHealthResult {
322                status: "unhealthy".to_string(),
323                response_time_ms: start.elapsed().as_millis() as u64,
324                error: Some("Service check failed".to_string()),
325            }
326        }
327    }
328}
329
330async fn check_token_manager_health(
331    auth_framework: &std::sync::Arc<crate::AuthFramework>,
332) -> ServiceHealthResult {
333    let start = std::time::Instant::now();
334
335    // Test token creation and validation (without storing)
336    let test_token = auth_framework.token_manager().create_jwt_token(
337        "health_check_user",
338        vec!["health_check".to_string()],
339        Some(std::time::Duration::from_secs(1)),
340    );
341
342    match test_token {
343        Ok(token) => {
344            // Validate the token we just created
345            match auth_framework.token_manager().validate_jwt_token(&token) {
346                Ok(_) => ServiceHealthResult {
347                    status: "healthy".to_string(),
348                    response_time_ms: start.elapsed().as_millis() as u64,
349                    error: None,
350                },
351                Err(e) => {
352                    tracing::warn!(error = %e, "Health check: token validation error");
353                    ServiceHealthResult {
354                        status: "unhealthy".to_string(),
355                        response_time_ms: start.elapsed().as_millis() as u64,
356                        error: Some("Service check failed".to_string()),
357                    }
358                }
359            }
360        }
361        Err(e) => {
362            tracing::warn!(error = %e, "Health check: token creation error");
363            ServiceHealthResult {
364                status: "unhealthy".to_string(),
365                response_time_ms: start.elapsed().as_millis() as u64,
366                error: Some("Service check failed".to_string()),
367            }
368        }
369    }
370}
371
372async fn check_memory_health() -> ServiceHealthResult {
373    let start = std::time::Instant::now();
374
375    // Simple memory allocation test
376    let test_vec: Vec<u8> = vec![0; 1024]; // 1KB test allocation
377
378    ServiceHealthResult {
379        status: if test_vec.len() == 1024 {
380            "healthy".to_string()
381        } else {
382            "unhealthy".to_string()
383        },
384        response_time_ms: start.elapsed().as_millis() as u64,
385        error: None,
386    }
387}
388
/// Returns a human-readable uptime string.
///
/// The clock starts the first time this function is called, not at process
/// start, so the first call always reports `"0 minutes"`. Elapsed time is
/// measured with the monotonic [`std::time::Instant`], so wall-clock
/// adjustments (NTP steps, manual changes) cannot skew or invalidate it.
///
/// Output formats (seconds are dropped):
/// - `"{d} days, {h} hours, {m} minutes"` once at least one day has passed,
/// - `"{h} hours, {m} minutes"` once at least one hour has passed,
/// - `"{m} minutes"` otherwise.
async fn get_uptime() -> String {
    use std::time::Instant;

    // Captured lazily on first use; every later call measures against it.
    static START_TIME: std::sync::OnceLock<Instant> = std::sync::OnceLock::new();

    let seconds = START_TIME.get_or_init(Instant::now).elapsed().as_secs();
    let days = seconds / 86_400;
    let hours = (seconds % 86_400) / 3_600;
    let minutes = (seconds % 3_600) / 60;

    if days > 0 {
        format!("{} days, {} hours, {} minutes", days, hours, minutes)
    } else if hours > 0 {
        format!("{} hours, {} minutes", hours, minutes)
    } else {
        format!("{} minutes", minutes)
    }
}
415
416async fn get_memory_info() -> MemoryInfo {
417    use sysinfo::System;
418    let mut sys = System::new();
419    sys.refresh_memory();
420
421    let total_mb = sys.total_memory() / (1024 * 1024);
422    let used_mb = sys.used_memory() / (1024 * 1024);
423    let free_mb = sys.available_memory() / (1024 * 1024);
424    let usage_percent = if total_mb > 0 {
425        (used_mb as f64 / total_mb as f64) * 100.0
426    } else {
427        0.0
428    };
429
430    MemoryInfo {
431        total_mb,
432        used_mb,
433        free_mb,
434        usage_percent,
435    }
436}
437
438async fn get_cpu_usage() -> f64 {
439    use sysinfo::System;
440    let mut sys = System::new();
441    sys.refresh_cpu_all();
442    // sysinfo needs a short delay between refreshes for meaningful CPU data.
443    tokio::time::sleep(std::time::Duration::from_millis(200)).await;
444    sys.refresh_cpu_all();
445    sys.global_cpu_usage() as f64
446}
447
448async fn get_disk_info() -> DiskInfo {
449    use sysinfo::Disks;
450    let disks = Disks::new_with_refreshed_list();
451    let (mut total, mut used) = (0u64, 0u64);
452    for disk in disks.list() {
453        total += disk.total_space();
454        used += disk.total_space() - disk.available_space();
455    }
456    let total_gb = total / (1024 * 1024 * 1024);
457    let used_gb = used / (1024 * 1024 * 1024);
458    let free_gb = total_gb.saturating_sub(used_gb);
459    let usage_percent = if total_gb > 0 {
460        (used_gb as f64 / total_gb as f64) * 100.0
461    } else {
462        0.0
463    };
464
465    DiskInfo {
466        total_gb,
467        used_gb,
468        free_gb,
469        usage_percent,
470    }
471}
472
473async fn get_network_info() -> NetworkInfo {
474    use sysinfo::Networks;
475    let networks = Networks::new_with_refreshed_list();
476    let (mut sent, mut received) = (0u64, 0u64);
477    for data in networks.list().values() {
478        sent += data.total_transmitted();
479        received += data.total_received();
480    }
481
482    NetworkInfo {
483        requests_per_minute: 0, // Application-level metric; not available from OS counters.
484        active_connections: 0,  // Application-level metric; not available from OS counters.
485        bytes_sent: sent,
486        bytes_received: received,
487    }
488}
489
/// Internal outcome of a single health probe, produced by the `check_*`
/// functions in this file before being mapped into the public response types.
#[derive(Debug)]
struct ServiceHealthResult {
    /// `"healthy"` or `"unhealthy"`.
    pub status: String,
    /// How long the probe took, in milliseconds.
    pub response_time_ms: u64,
    /// Generic failure message when the probe failed; `None` on success.
    pub error: Option<String>,
}