1use crate::Result;
24use serde::{Deserialize, Serialize};
25use std::collections::HashMap;
26use std::sync::Arc;
27use std::time::{Duration, Instant, SystemTime};
28use tokio::sync::RwLock;
29
30mod checks;
31mod endpoints;
32mod metrics;
33
34pub use checks::{
35 ComponentChecker, CompositeHealthChecker, DhtHealthChecker, NetworkHealthChecker,
36 PeerHealthChecker, ResourceHealthChecker, StorageHealthChecker, TransportHealthChecker,
37};
38pub use endpoints::{HealthEndpoints, HealthServer};
39pub use metrics::{HealthMetrics, PrometheusExporter};
40
41#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
43#[serde(rename_all = "lowercase")]
44pub enum HealthStatus {
45 Healthy,
47 Degraded,
49 Unhealthy,
51}
52
53impl HealthStatus {
54 pub fn is_operational(&self) -> bool {
56 matches!(self, HealthStatus::Healthy | HealthStatus::Degraded)
57 }
58
59 pub fn as_str(&self) -> &'static str {
61 match self {
62 HealthStatus::Healthy => "healthy",
63 HealthStatus::Degraded => "degraded",
64 HealthStatus::Unhealthy => "unhealthy",
65 }
66 }
67}
68
69#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct ComponentHealth {
72 pub status: HealthStatus,
74 pub latency_ms: u64,
76 #[serde(skip_serializing_if = "Option::is_none")]
78 pub error: Option<String>,
79 #[serde(skip_serializing_if = "HashMap::is_empty")]
81 pub metadata: HashMap<String, serde_json::Value>,
82}
83
84impl ComponentHealth {
85 pub fn healthy(latency_ms: u64) -> Self {
87 Self {
88 status: HealthStatus::Healthy,
89 latency_ms,
90 error: None,
91 metadata: HashMap::new(),
92 }
93 }
94
95 pub fn unhealthy(latency_ms: u64, error: String) -> Self {
97 Self {
98 status: HealthStatus::Unhealthy,
99 latency_ms,
100 error: Some(error),
101 metadata: HashMap::new(),
102 }
103 }
104
105 pub fn degraded(latency_ms: u64, error: Option<String>) -> Self {
107 Self {
108 status: HealthStatus::Degraded,
109 latency_ms,
110 error,
111 metadata: HashMap::new(),
112 }
113 }
114
115 pub fn with_metadata(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
117 self.metadata.insert(key.into(), value);
118 self
119 }
120}
121
122#[derive(Debug, Clone, Serialize, Deserialize)]
124pub struct HealthResponse {
125 pub status: String,
127 pub version: String,
129 pub uptime: Duration,
131 pub checks: HashMap<String, ComponentHealth>,
133 pub timestamp: SystemTime,
135}
136
137impl HealthResponse {
138 pub fn new(version: String, uptime: Duration) -> Self {
140 Self {
141 status: "healthy".to_string(),
142 version,
143 uptime,
144 checks: HashMap::new(),
145 timestamp: SystemTime::now(),
146 }
147 }
148
149 pub fn add_check(&mut self, name: impl Into<String>, health: ComponentHealth) {
151 if health.status == HealthStatus::Unhealthy {
153 self.status = "unhealthy".to_string();
154 } else if health.status == HealthStatus::Degraded && self.status != "unhealthy" {
155 self.status = "degraded".to_string();
156 }
157
158 self.checks.insert(name.into(), health);
159 }
160
161 pub fn is_ready(&self) -> bool {
163 self.status != "unhealthy"
164 }
165
166 pub fn is_alive(&self) -> bool {
168 true
170 }
171}
172
173#[derive(Debug, Clone, Serialize, Deserialize)]
175pub struct DebugInfo {
176 pub system: SystemInfo,
178 pub runtime: RuntimeInfo,
180 pub components: HashMap<String, serde_json::Value>,
182}
183
184#[derive(Debug, Clone, Serialize, Deserialize)]
186pub struct SystemInfo {
187 pub os: String,
189 pub arch: String,
191 pub cpu_count: usize,
193 pub total_memory: u64,
195 pub available_memory: u64,
197}
198
199#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct RuntimeInfo {
202 pub rust_version: String,
204 pub thread_count: usize,
206 pub memory_usage: u64,
208 pub uptime: Duration,
210}
211
212pub struct HealthManager {
214 start_time: Instant,
216 version: String,
218 checkers: Arc<RwLock<HashMap<String, Box<dyn ComponentChecker>>>>,
220 cached_response: Arc<RwLock<Option<(Instant, HealthResponse)>>>,
222 cache_duration: Duration,
224}
225
226impl HealthManager {
227 pub fn new(version: String) -> Self {
229 Self {
230 start_time: Instant::now(),
231 version,
232 checkers: Arc::new(RwLock::new(HashMap::new())),
233 cached_response: Arc::new(RwLock::new(None)),
234 cache_duration: Duration::from_millis(100), }
236 }
237
238 pub async fn register_checker(
240 &self,
241 name: impl Into<String>,
242 checker: Box<dyn ComponentChecker>,
243 ) {
244 let mut checkers = self.checkers.write().await;
245 checkers.insert(name.into(), checker);
246 }
247
248 pub async fn get_health(&self) -> Result<HealthResponse> {
250 {
252 let cache = self.cached_response.read().await;
253 if let Some((cached_at, ref response)) = *cache
254 && cached_at.elapsed() < self.cache_duration
255 {
256 return Ok(response.clone());
257 }
258 }
259
260 let uptime = self.start_time.elapsed();
262 let mut response = HealthResponse::new(self.version.clone(), uptime);
263
264 let checkers = self.checkers.read().await;
265 for (name, checker) in checkers.iter() {
266 let start = Instant::now();
267 let health = match checker.check().await {
268 Ok(status) => {
269 let latency_ms = start.elapsed().as_millis() as u64;
270 match status {
271 HealthStatus::Healthy => ComponentHealth::healthy(latency_ms),
272 HealthStatus::Degraded => ComponentHealth::degraded(latency_ms, None),
273 HealthStatus::Unhealthy => {
274 ComponentHealth::unhealthy(latency_ms, "Check failed".to_string())
275 }
276 }
277 }
278 Err(e) => {
279 let latency_ms = start.elapsed().as_millis() as u64;
280 ComponentHealth::unhealthy(latency_ms, e.to_string())
281 }
282 };
283 response.add_check(name, health);
284 }
285
286 {
288 let mut cache = self.cached_response.write().await;
289 *cache = Some((Instant::now(), response.clone()));
290 }
291
292 Ok(response)
293 }
294
295 pub async fn get_debug_info(&self) -> Result<DebugInfo> {
297 let system = SystemInfo {
298 os: std::env::consts::OS.to_string(),
299 arch: std::env::consts::ARCH.to_string(),
300 cpu_count: num_cpus::get(),
301 total_memory: Self::get_total_memory(),
302 available_memory: Self::get_available_memory(),
303 };
304
305 let runtime = RuntimeInfo {
306 rust_version: env!("CARGO_PKG_VERSION").to_string(), thread_count: Self::get_thread_count(),
308 memory_usage: Self::get_memory_usage(),
309 uptime: self.start_time.elapsed(),
310 };
311
312 let mut components = HashMap::new();
313 let checkers = self.checkers.read().await;
314 for (name, checker) in checkers.iter() {
315 if let Some(debug_info) = checker.debug_info().await {
316 components.insert(name.clone(), debug_info);
317 }
318 }
319
320 Ok(DebugInfo {
321 system,
322 runtime,
323 components,
324 })
325 }
326
327 fn get_total_memory() -> u64 {
329 8 * 1024 * 1024 * 1024 }
332
333 fn get_available_memory() -> u64 {
335 4 * 1024 * 1024 * 1024 }
338
339 fn get_thread_count() -> usize {
341 4
343 }
344
345 fn get_memory_usage() -> u64 {
347 100 * 1024 * 1024 }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Every status reports the expected operational flag and lowercase label.
    #[test]
    fn test_health_status() {
        let expectations = [
            (HealthStatus::Healthy, true, "healthy"),
            (HealthStatus::Degraded, true, "degraded"),
            (HealthStatus::Unhealthy, false, "unhealthy"),
        ];

        for (status, operational, label) in expectations {
            assert_eq!(status.is_operational(), operational);
            assert_eq!(status.as_str(), label);
        }
    }

    /// The constructors set status, latency and error; `with_metadata` stores entries.
    #[test]
    fn test_component_health() {
        let ok = ComponentHealth::healthy(10);
        assert_eq!(ok.status, HealthStatus::Healthy);
        assert_eq!(ok.latency_ms, 10);
        assert!(ok.error.is_none());

        let failed = ComponentHealth::unhealthy(20, "Connection failed".to_string());
        assert_eq!(failed.status, HealthStatus::Unhealthy);
        assert_eq!(failed.latency_ms, 20);
        assert_eq!(failed.error.as_deref(), Some("Connection failed"));

        let expected_connections = serde_json::json!(95);
        let slow = ComponentHealth::degraded(15, Some("High latency".to_string()))
            .with_metadata("connections", serde_json::json!(95));
        assert_eq!(slow.status, HealthStatus::Degraded);
        assert_eq!(
            slow.metadata.get("connections"),
            Some(&expected_connections)
        );
    }

    /// The aggregate status escalates as worse checks are added.
    #[test]
    fn test_health_response() {
        let mut report = HealthResponse::new("1.0.0".to_string(), Duration::from_secs(3600));
        assert_eq!(report.status, "healthy");
        assert!(report.is_ready());
        assert!(report.is_alive());

        // A healthy check leaves the aggregate untouched.
        report.add_check("network", ComponentHealth::healthy(5));
        assert_eq!(report.status, "healthy");

        // A degraded check downgrades the aggregate but keeps it ready.
        report.add_check("dht", ComponentHealth::degraded(50, None));
        assert_eq!(report.status, "degraded");
        assert!(report.is_ready());

        // An unhealthy check makes the aggregate unhealthy and not ready.
        let disk_full = ComponentHealth::unhealthy(100, "Disk full".to_string());
        report.add_check("storage", disk_full);
        assert_eq!(report.status, "unhealthy");
        assert!(!report.is_ready());
        // Liveness does not depend on the checks.
        assert!(report.is_alive());
    }

    /// With no checkers the manager is healthy, and a second call is served from the cache.
    #[tokio::test]
    async fn test_health_manager() {
        let manager = HealthManager::new("1.0.0".to_string());

        let first = manager.get_health().await.unwrap();
        assert_eq!(first.status, "healthy");
        assert!(first.checks.is_empty());

        // Identical timestamps show that the second response came from the cache.
        let second = manager.get_health().await.unwrap();
        assert_eq!(first.timestamp, second.timestamp);
    }

    /// `DebugInfo` serializes to JSON that contains the system fields.
    #[test]
    fn test_debug_info_structure() {
        let debug_info = DebugInfo {
            system: SystemInfo {
                os: "linux".to_string(),
                arch: "x86_64".to_string(),
                cpu_count: 8,
                total_memory: 16 * 1024 * 1024 * 1024,
                available_memory: 8 * 1024 * 1024 * 1024,
            },
            runtime: RuntimeInfo {
                rust_version: "1.75.0".to_string(),
                thread_count: 10,
                memory_usage: 500 * 1024 * 1024,
                uptime: Duration::from_secs(7200),
            },
            components: HashMap::new(),
        };

        let json = serde_json::to_string(&debug_info).unwrap();
        for needle in ["linux", "x86_64"] {
            assert!(json.contains(needle));
        }
    }
}