saorsa_core/health/
mod.rs

1// Copyright 2024 Saorsa Labs Limited
2//
3// This software is dual-licensed under:
4// - GNU Affero General Public License v3.0 or later (AGPL-3.0-or-later)
5// - Commercial License
6//
7// For AGPL-3.0 license, see LICENSE-AGPL-3.0
8// For commercial licensing, contact: saorsalabs@gmail.com
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under these licenses is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
14//! Health check system for P2P Foundation
15//!
16//! This module provides comprehensive health monitoring with:
17//! - HTTP endpoints for liveness and readiness checks
18//! - Prometheus-compatible metrics export
19//! - Component-level health status
20//! - Debug information endpoints
21//! - Sub-100ms response times
22
23use crate::Result;
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26use std::sync::Arc;
27use std::time::{Duration, Instant, SystemTime};
28use tokio::sync::RwLock;
29
30mod checks;
31mod endpoints;
32mod metrics;
33
34pub use checks::{
35    ComponentChecker, CompositeHealthChecker, DhtHealthChecker, NetworkHealthChecker,
36    PeerHealthChecker, ResourceHealthChecker, StorageHealthChecker, TransportHealthChecker,
37};
38pub use endpoints::{HealthEndpoints, HealthServer};
39pub use metrics::{HealthMetrics, PrometheusExporter};
40
41/// Health status for a component
42#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
43#[serde(rename_all = "lowercase")]
44pub enum HealthStatus {
45    /// Component is healthy and functioning normally
46    Healthy,
47    /// Component is degraded but still functional
48    Degraded,
49    /// Component is unhealthy and not functioning
50    Unhealthy,
51}
52
53impl HealthStatus {
54    /// Check if the status indicates the component is operational
55    pub fn is_operational(&self) -> bool {
56        matches!(self, HealthStatus::Healthy | HealthStatus::Degraded)
57    }
58
59    /// Get string representation for serialization
60    pub fn as_str(&self) -> &'static str {
61        match self {
62            HealthStatus::Healthy => "healthy",
63            HealthStatus::Degraded => "degraded",
64            HealthStatus::Unhealthy => "unhealthy",
65        }
66    }
67}
68
69/// Health information for a single component
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct ComponentHealth {
72    /// Current status of the component
73    pub status: HealthStatus,
74    /// Response time in milliseconds
75    pub latency_ms: u64,
76    /// Optional error message if unhealthy
77    #[serde(skip_serializing_if = "Option::is_none")]
78    pub error: Option<String>,
79    /// Additional metadata about the component
80    #[serde(skip_serializing_if = "HashMap::is_empty")]
81    pub metadata: HashMap<String, serde_json::Value>,
82}
83
84impl ComponentHealth {
85    /// Create a healthy component status
86    pub fn healthy(latency_ms: u64) -> Self {
87        Self {
88            status: HealthStatus::Healthy,
89            latency_ms,
90            error: None,
91            metadata: HashMap::new(),
92        }
93    }
94
95    /// Create an unhealthy component status
96    pub fn unhealthy(latency_ms: u64, error: String) -> Self {
97        Self {
98            status: HealthStatus::Unhealthy,
99            latency_ms,
100            error: Some(error),
101            metadata: HashMap::new(),
102        }
103    }
104
105    /// Create a degraded component status
106    pub fn degraded(latency_ms: u64, error: Option<String>) -> Self {
107        Self {
108            status: HealthStatus::Degraded,
109            latency_ms,
110            error,
111            metadata: HashMap::new(),
112        }
113    }
114
115    /// Add metadata to the component health
116    pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
117        self.metadata.insert(key.into(), value);
118        self
119    }
120}
121
122/// Overall health response
123#[derive(Debug, Clone, Serialize, Deserialize)]
124pub struct HealthResponse {
125    /// Overall status (healthy, degraded, or unhealthy)
126    pub status: String,
127    /// Version of the P2P node
128    pub version: String,
129    /// Uptime duration
130    pub uptime: Duration,
131    /// Individual component health checks
132    pub checks: HashMap<String, ComponentHealth>,
133    /// Timestamp of the health check
134    pub timestamp: SystemTime,
135}
136
137impl HealthResponse {
138    /// Create a new health response
139    pub fn new(version: String, uptime: Duration) -> Self {
140        Self {
141            status: "healthy".to_string(),
142            version,
143            uptime,
144            checks: HashMap::new(),
145            timestamp: SystemTime::now(),
146        }
147    }
148
149    /// Add a component health check result
150    pub fn add_check(&mut self, name: impl Into<String>, health: ComponentHealth) {
151        // Update overall status based on component health
152        if health.status == HealthStatus::Unhealthy {
153            self.status = "unhealthy".to_string();
154        } else if health.status == HealthStatus::Degraded && self.status != "unhealthy" {
155            self.status = "degraded".to_string();
156        }
157
158        self.checks.insert(name.into(), health);
159    }
160
161    /// Check if the system is ready to serve traffic
162    pub fn is_ready(&self) -> bool {
163        self.status != "unhealthy"
164    }
165
166    /// Check if the system is alive (basic liveness)
167    pub fn is_alive(&self) -> bool {
168        // System is alive if we can generate a response
169        true
170    }
171}
172
173/// Debug information response
174#[derive(Debug, Clone, Serialize, Deserialize)]
175pub struct DebugInfo {
176    /// System information
177    pub system: SystemInfo,
178    /// Runtime information
179    pub runtime: RuntimeInfo,
180    /// Component details
181    pub components: HashMap<String, serde_json::Value>,
182}
183
184/// System information for debug endpoint
185#[derive(Debug, Clone, Serialize, Deserialize)]
186pub struct SystemInfo {
187    /// Operating system
188    pub os: String,
189    /// Architecture
190    pub arch: String,
191    /// Number of CPUs
192    pub cpu_count: usize,
193    /// Total memory in bytes
194    pub total_memory: u64,
195    /// Available memory in bytes
196    pub available_memory: u64,
197}
198
199/// Runtime information for debug endpoint
200#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct RuntimeInfo {
202    /// Rust version
203    pub rust_version: String,
204    /// Number of active threads
205    pub thread_count: usize,
206    /// Current memory usage in bytes
207    pub memory_usage: u64,
208    /// Uptime duration
209    pub uptime: Duration,
210}
211
212/// Health check manager
213pub struct HealthManager {
214    /// Start time for uptime calculation
215    start_time: Instant,
216    /// Version string
217    version: String,
218    /// Component checkers
219    checkers: Arc<RwLock<HashMap<String, Box<dyn ComponentChecker>>>>,
220    /// Cached health response
221    cached_response: Arc<RwLock<Option<(Instant, HealthResponse)>>>,
222    /// Cache duration
223    cache_duration: Duration,
224}
225
226impl HealthManager {
227    /// Create a new health manager
228    pub fn new(version: String) -> Self {
229        Self {
230            start_time: Instant::now(),
231            version,
232            checkers: Arc::new(RwLock::new(HashMap::new())),
233            cached_response: Arc::new(RwLock::new(None)),
234            cache_duration: Duration::from_millis(100), // Cache for 100ms
235        }
236    }
237
238    /// Register a component health checker
239    pub async fn register_checker(
240        &self,
241        name: impl Into<String>,
242        checker: Box<dyn ComponentChecker>,
243    ) {
244        let mut checkers = self.checkers.write().await;
245        checkers.insert(name.into(), checker);
246    }
247
248    /// Get current health status
249    pub async fn get_health(&self) -> Result<HealthResponse> {
250        // Check cache first
251        {
252            let cache = self.cached_response.read().await;
253            if let Some((cached_at, ref response)) = *cache
254                && cached_at.elapsed() < self.cache_duration
255            {
256                return Ok(response.clone());
257            }
258        }
259
260        // Perform health checks
261        let uptime = self.start_time.elapsed();
262        let mut response = HealthResponse::new(self.version.clone(), uptime);
263
264        let checkers = self.checkers.read().await;
265        for (name, checker) in checkers.iter() {
266            let start = Instant::now();
267            let health = match checker.check().await {
268                Ok(status) => {
269                    let latency_ms = start.elapsed().as_millis() as u64;
270                    match status {
271                        HealthStatus::Healthy => ComponentHealth::healthy(latency_ms),
272                        HealthStatus::Degraded => ComponentHealth::degraded(latency_ms, None),
273                        HealthStatus::Unhealthy => {
274                            ComponentHealth::unhealthy(latency_ms, "Check failed".to_string())
275                        }
276                    }
277                }
278                Err(e) => {
279                    let latency_ms = start.elapsed().as_millis() as u64;
280                    ComponentHealth::unhealthy(latency_ms, e.to_string())
281                }
282            };
283            response.add_check(name, health);
284        }
285
286        // Update cache
287        {
288            let mut cache = self.cached_response.write().await;
289            *cache = Some((Instant::now(), response.clone()));
290        }
291
292        Ok(response)
293    }
294
295    /// Get debug information
296    pub async fn get_debug_info(&self) -> Result<DebugInfo> {
297        let system = SystemInfo {
298            os: std::env::consts::OS.to_string(),
299            arch: std::env::consts::ARCH.to_string(),
300            cpu_count: num_cpus::get(),
301            total_memory: Self::get_total_memory(),
302            available_memory: Self::get_available_memory(),
303        };
304
305        let runtime = RuntimeInfo {
306            rust_version: env!("CARGO_PKG_VERSION").to_string(), // Use package version instead
307            thread_count: Self::get_thread_count(),
308            memory_usage: Self::get_memory_usage(),
309            uptime: self.start_time.elapsed(),
310        };
311
312        let mut components = HashMap::new();
313        let checkers = self.checkers.read().await;
314        for (name, checker) in checkers.iter() {
315            if let Some(debug_info) = checker.debug_info().await {
316                components.insert(name.clone(), debug_info);
317            }
318        }
319
320        Ok(DebugInfo {
321            system,
322            runtime,
323            components,
324        })
325    }
326
327    /// Get total system memory (stub implementation)
328    fn get_total_memory() -> u64 {
329        // In a real implementation, use sysinfo crate
330        8 * 1024 * 1024 * 1024 // 8GB default
331    }
332
333    /// Get available system memory (stub implementation)
334    fn get_available_memory() -> u64 {
335        // In a real implementation, use sysinfo crate
336        4 * 1024 * 1024 * 1024 // 4GB default
337    }
338
339    /// Get current thread count (stub implementation)
340    fn get_thread_count() -> usize {
341        // In a real implementation, use std::thread or sysinfo
342        4
343    }
344
345    /// Get current memory usage (stub implementation)
346    fn get_memory_usage() -> u64 {
347        // In a real implementation, use jemalloc stats or similar
348        100 * 1024 * 1024 // 100MB default
349    }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Every status maps to the expected operational flag and name.
    #[test]
    fn test_health_status() {
        let expectations = [
            (HealthStatus::Healthy, true, "healthy"),
            (HealthStatus::Degraded, true, "degraded"),
            (HealthStatus::Unhealthy, false, "unhealthy"),
        ];

        for (status, operational, label) in expectations {
            assert_eq!(status.is_operational(), operational);
            assert_eq!(status.as_str(), label);
        }
    }

    /// The constructors and `with_metadata` fill the fields they promise.
    #[test]
    fn test_component_health() {
        let ok = ComponentHealth::healthy(10);
        assert_eq!(ok.status, HealthStatus::Healthy);
        assert_eq!(ok.latency_ms, 10);
        assert!(ok.error.is_none());

        let failed = ComponentHealth::unhealthy(20, "Connection failed".to_string());
        assert_eq!(failed.status, HealthStatus::Unhealthy);
        assert_eq!(failed.latency_ms, 20);
        assert_eq!(failed.error.as_deref(), Some("Connection failed"));

        let connections = serde_json::json!(95);
        let slow = ComponentHealth::degraded(15, Some("High latency".to_string()))
            .with_metadata("connections", connections.clone());
        assert_eq!(slow.status, HealthStatus::Degraded);
        assert_eq!(slow.metadata.get("connections"), Some(&connections));
    }

    /// The overall status follows the worst component added so far.
    #[test]
    fn test_health_response() {
        let mut report = HealthResponse::new("1.0.0".to_string(), Duration::from_secs(3600));
        assert_eq!(report.status, "healthy");
        assert!(report.is_ready());
        assert!(report.is_alive());

        // A healthy component leaves the report healthy.
        report.add_check("network", ComponentHealth::healthy(5));
        assert_eq!(report.status, "healthy");

        // A degraded component degrades the report but keeps it ready.
        report.add_check("dht", ComponentHealth::degraded(50, None));
        assert_eq!(report.status, "degraded");
        assert!(report.is_ready());

        // An unhealthy component makes the report unhealthy and not ready.
        let disk_full = ComponentHealth::unhealthy(100, "Disk full".to_string());
        report.add_check("storage", disk_full);
        assert_eq!(report.status, "unhealthy");
        assert!(!report.is_ready());
        // Liveness does not depend on component health.
        assert!(report.is_alive());
    }

    /// A manager without checkers is healthy and serves repeated calls from cache.
    #[tokio::test]
    async fn test_health_manager() {
        let manager = HealthManager::new("1.0.0".to_string());

        let first = manager.get_health().await.unwrap();
        assert_eq!(first.status, "healthy");
        assert!(first.checks.is_empty());

        // An identical timestamp shows the second call came from the cache.
        let second = manager.get_health().await.unwrap();
        assert_eq!(first.timestamp, second.timestamp);
    }

    /// `DebugInfo` serializes to JSON containing the system fields.
    #[test]
    fn test_debug_info_structure() {
        let debug_info = DebugInfo {
            system: SystemInfo {
                os: "linux".to_string(),
                arch: "x86_64".to_string(),
                cpu_count: 8,
                total_memory: 16 * 1024 * 1024 * 1024,
                available_memory: 8 * 1024 * 1024 * 1024,
            },
            runtime: RuntimeInfo {
                rust_version: "1.75.0".to_string(),
                thread_count: 10,
                memory_usage: 500 * 1024 * 1024,
                uptime: Duration::from_secs(7200),
            },
            components: HashMap::new(),
        };

        let json = serde_json::to_string(&debug_info).unwrap();
        for needle in ["linux", "x86_64"] {
            assert!(json.contains(needle));
        }
    }
}