// elara_runtime/health.rs
1//! Health Check System for ELARA Runtime
2//!
3//! This module provides a production-grade health checking system with:
4//! - Pluggable health checks via the `HealthCheck` trait
5//! - Result caching with configurable TTL to avoid excessive checking
6//! - Proper aggregation logic (Unhealthy > Degraded > Healthy)
7//! - Thread-safe operation with Arc<RwLock<>>
8//! - Support for Kubernetes liveness and readiness probes
9//! - Four built-in health checks for common monitoring needs
10//!
11//! # Architecture
12//!
13//! The health check system consists of:
14//! - `HealthCheck` trait: Defines the interface for individual health checks
15//! - `HealthChecker`: Orchestrates multiple health checks and caches results
16//! - `HealthStatus`: Aggregated health status with individual check results
17//! - `HealthCheckResult`: Result of an individual health check
18//!
19//! # Built-in Health Checks
20//!
21//! The module provides four production-ready health checks:
22//!
23//! ## 1. ConnectionHealthCheck
24//!
25//! Monitors the number of active connections to ensure the node is properly
26//! connected to the network. Returns `Degraded` if the connection count falls
27//! below the configured minimum.
28//!
29//! **Use case:** Detect network isolation or connectivity issues
30//!
31//! ## 2. MemoryHealthCheck
32//!
33//! Monitors process memory usage using the `sysinfo` crate to obtain real
34//! system metrics. Returns `Unhealthy` if memory usage exceeds the configured
35//! maximum, which helps prevent OOM kills and performance degradation.
36//!
37//! **Use case:** Detect memory leaks or excessive memory consumption
38//!
39//! ## 3. TimeDriftCheck
40//!
41//! Monitors time drift between the local node and network consensus time.
42//! Returns `Degraded` if drift exceeds the configured threshold. Excessive
43//! time drift can cause synchronization issues and state divergence.
44//!
45//! **Use case:** Detect clock synchronization issues
46//!
47//! ## 4. StateDivergenceCheck
48//!
49//! Monitors the state reconciliation engine to ensure state is converging
50//! properly. Returns `Degraded` if the number of pending events exceeds
51//! the configured threshold, which may indicate network partitions or
52//! reconciliation issues.
53//!
54//! **Use case:** Detect state convergence problems
55//!
56//! # Example
57//!
58//! ```rust,no_run
59//! use elara_runtime::health::{
60//! HealthChecker, HealthCheck, HealthCheckResult,
61//! ConnectionHealthCheck, MemoryHealthCheck, TimeDriftCheck, StateDivergenceCheck
62//! };
63//! use elara_runtime::node::Node;
64//! use std::sync::Arc;
65//! use std::time::Duration;
66//!
67//! // Create a node
68//! let node = Arc::new(Node::new());
69//!
70//! // Create health checker with 30-second cache
71//! let mut checker = HealthChecker::new(Duration::from_secs(30));
72//!
73//! // Add built-in health checks
74//! checker.add_check(Box::new(ConnectionHealthCheck::new(node.clone(), 3)));
75//! checker.add_check(Box::new(MemoryHealthCheck::new(1800))); // 1800 MB
76//! checker.add_check(Box::new(TimeDriftCheck::new(node.clone(), 100))); // 100ms
77//! checker.add_check(Box::new(StateDivergenceCheck::new(node)));
78//!
79//! // Check health
80//! let status = checker.check_health();
81//!
82//! if status.is_healthy() {
83//! println!("All systems operational");
84//! } else if status.is_degraded() {
85//! println!("System degraded: {:?}", status.overall.reason());
86//! } else {
87//! println!("System unhealthy: {:?}", status.overall.reason());
88//! }
89//! ```
90//!
91//! # Production Deployment
92//!
93//! In production, health checks are typically exposed via HTTP endpoints:
94//!
95//! - `/health` - Overall health status (200 OK if healthy/degraded, 503 if unhealthy)
96//! - `/ready` - Readiness probe for Kubernetes (checks if node can accept traffic)
97//! - `/live` - Liveness probe for Kubernetes (checks if node should be restarted)
98//!
99//! Configure thresholds based on your deployment:
100//!
101//! - **Small deployment (10 nodes)**: min_connections=2, max_memory_mb=1000
102//! - **Medium deployment (100 nodes)**: min_connections=5, max_memory_mb=2000
103//! - **Large deployment (1000 nodes)**: min_connections=10, max_memory_mb=4000
104
105use std::collections::HashMap;
106use std::sync::Arc;
107use std::time::{Duration, Instant};
108use parking_lot::RwLock;
109
/// Outcome of a single health check.
///
/// A check reports one of three states, ordered by increasing severity:
/// `Healthy` < `Degraded` < `Unhealthy`. When many checks are combined,
/// the worst state wins (Unhealthy > Degraded > Healthy).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HealthCheckResult {
    /// The component is functioning normally
    Healthy,

    /// The component works, but at reduced capacity or performance.
    ///
    /// Covers non-critical issues (elevated latency, partial capacity)
    /// that do not prevent operation.
    Degraded {
        /// Human-readable reason for the degraded state
        reason: String,
    },

    /// The component cannot operate normally.
    ///
    /// Signals a critical issue that prevents normal operation.
    Unhealthy {
        /// Human-readable reason for the unhealthy state
        reason: String,
    },
}

impl HealthCheckResult {
    /// Severity rank used for worst-wins aggregation (higher is worse):
    /// Healthy = 0, Degraded = 1, Unhealthy = 2.
    fn severity(&self) -> u8 {
        match self {
            HealthCheckResult::Healthy => 0,
            HealthCheckResult::Degraded { .. } => 1,
            HealthCheckResult::Unhealthy { .. } => 2,
        }
    }

    /// True when this result is `Healthy`.
    pub fn is_healthy(&self) -> bool {
        self.severity() == 0
    }

    /// True when this result is `Degraded`.
    pub fn is_degraded(&self) -> bool {
        self.severity() == 1
    }

    /// True when this result is `Unhealthy`.
    pub fn is_unhealthy(&self) -> bool {
        self.severity() == 2
    }

    /// The explanatory message, if any; `Healthy` carries none.
    pub fn reason(&self) -> Option<&str> {
        match self {
            HealthCheckResult::Healthy => None,
            HealthCheckResult::Degraded { reason }
            | HealthCheckResult::Unhealthy { reason } => Some(reason),
        }
    }
}
176
/// Trait for implementing custom health checks.
///
/// Health checks must be `Send + Sync` so they can be executed
/// concurrently across threads. The `check()` method should be fast
/// (ideally < 10ms) to avoid blocking the health check endpoint.
///
/// # Example
///
/// ```rust,ignore
/// use std::sync::Arc;
/// use elara_runtime::health::{HealthCheck, HealthCheckResult};
///
/// struct DatabaseHealthCheck {
///     connection_pool: Arc<ConnectionPool>,
/// }
///
/// impl HealthCheck for DatabaseHealthCheck {
///     fn name(&self) -> &str {
///         "database"
///     }
///
///     fn check(&self) -> HealthCheckResult {
///         match self.connection_pool.ping() {
///             Ok(_) => HealthCheckResult::Healthy,
///             Err(e) => HealthCheckResult::Unhealthy {
///                 reason: format!("Database ping failed: {}", e),
///             },
///         }
///     }
/// }
/// ```
pub trait HealthCheck: Send + Sync {
    /// Returns the name of this health check.
    ///
    /// The name should be unique within a `HealthChecker` — results are
    /// keyed by name, so a duplicate name overwrites the earlier check's
    /// result in the aggregated status. It should be a valid identifier
    /// (lowercase, alphanumeric, underscores).
    fn name(&self) -> &str;

    /// Performs the health check and returns the result.
    ///
    /// This method should be relatively fast (< 10ms ideally) to avoid
    /// blocking the health check endpoint. For expensive checks, consider
    /// running them in the background and caching the result.
    fn check(&self) -> HealthCheckResult;
}
221
222/// Aggregated health status containing overall status and individual check results.
223///
224/// The overall status is determined by the worst individual check result:
225/// - If any check is Unhealthy, overall is Unhealthy
226/// - Else if any check is Degraded, overall is Degraded
227/// - Else overall is Healthy
228#[derive(Debug, Clone)]
229pub struct HealthStatus {
230 /// Overall health status (worst of all checks)
231 pub overall: HealthCheckResult,
232
233 /// Individual health check results by name
234 pub checks: HashMap<String, HealthCheckResult>,
235
236 /// Timestamp when this status was computed
237 pub timestamp: Instant,
238}
239
240impl HealthStatus {
241 /// Creates a new HealthStatus with the given checks
242 fn new(checks: HashMap<String, HealthCheckResult>) -> Self {
243 let overall = Self::aggregate_results(&checks);
244 Self {
245 overall,
246 checks,
247 timestamp: Instant::now(),
248 }
249 }
250
251 /// Aggregates individual check results into an overall status.
252 ///
253 /// Precedence: Unhealthy > Degraded > Healthy
254 fn aggregate_results(checks: &HashMap<String, HealthCheckResult>) -> HealthCheckResult {
255 if checks.is_empty() {
256 return HealthCheckResult::Healthy;
257 }
258
259 // Find the worst result
260 let worst = checks.values()
261 .max_by_key(|result| result.severity())
262 .unwrap(); // Safe because we checked is_empty
263
264 worst.clone()
265 }
266
267 /// Returns true if the overall status is Healthy
268 pub fn is_healthy(&self) -> bool {
269 self.overall.is_healthy()
270 }
271
272 /// Returns true if the overall status is Degraded
273 pub fn is_degraded(&self) -> bool {
274 self.overall.is_degraded()
275 }
276
277 /// Returns true if the overall status is Unhealthy
278 pub fn is_unhealthy(&self) -> bool {
279 self.overall.is_unhealthy()
280 }
281}
282
/// Configuration options for the health checker.
#[derive(Debug, Clone)]
pub struct HealthCheckerConfig {
    /// How long a computed health status remains valid in the cache
    pub cache_ttl: Duration,
}

impl Default for HealthCheckerConfig {
    /// Defaults to a 30-second cache TTL.
    fn default() -> Self {
        HealthCheckerConfig {
            cache_ttl: Duration::from_secs(30),
        }
    }
}
297
298/// Comprehensive health check configuration for NodeConfig.
299///
300/// This configuration enables and configures the health check system for a node.
301/// When set in `NodeConfig`, the node will automatically initialize health checks
302/// and optionally start an HTTP server to expose health endpoints.
303///
304/// # Health Check System
305///
306/// The health check system provides:
307/// - Built-in checks for connections, memory, time drift, and state convergence
308/// - Configurable thresholds for each check
309/// - HTTP endpoints for Kubernetes probes and load balancers
310/// - Result caching to minimize overhead
311///
312/// # HTTP Endpoints
313///
314/// When `server_bind_address` is set, the following endpoints are exposed:
315/// - `GET /health` - Overall health status (200 OK if healthy/degraded, 503 if unhealthy)
316/// - `GET /ready` - Readiness probe (200 OK if healthy/degraded, 503 if unhealthy)
317/// - `GET /live` - Liveness probe (200 OK if healthy/degraded, 503 if unhealthy)
318///
319/// # Example
320///
321/// ```rust,no_run
322/// use elara_runtime::health::HealthCheckConfig;
323/// use elara_runtime::node::NodeConfig;
324/// use std::time::Duration;
325///
326/// let health_config = HealthCheckConfig {
327/// enabled: true,
328/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
329/// cache_ttl: Duration::from_secs(30),
330/// min_connections: Some(3),
331/// max_memory_mb: Some(1800),
332/// max_time_drift_ms: Some(100),
333/// max_pending_events: Some(1000),
334/// };
335///
336/// let node_config = NodeConfig {
337/// health_checks: Some(health_config),
338/// ..Default::default()
339/// };
340/// ```
341///
342/// # Production Recommendations
343///
344/// ## Small Deployment (10 nodes)
345/// ```rust,no_run
346/// use elara_runtime::health::HealthCheckConfig;
347/// use std::time::Duration;
348///
349/// let config = HealthCheckConfig {
350/// enabled: true,
351/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
352/// cache_ttl: Duration::from_secs(30),
353/// min_connections: Some(2),
354/// max_memory_mb: Some(1000),
355/// max_time_drift_ms: Some(100),
356/// max_pending_events: Some(500),
357/// };
358/// ```
359///
360/// ## Medium Deployment (100 nodes)
361/// ```rust,no_run
362/// use elara_runtime::health::HealthCheckConfig;
363/// use std::time::Duration;
364///
365/// let config = HealthCheckConfig {
366/// enabled: true,
367/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
368/// cache_ttl: Duration::from_secs(30),
369/// min_connections: Some(5),
370/// max_memory_mb: Some(2000),
371/// max_time_drift_ms: Some(100),
372/// max_pending_events: Some(1000),
373/// };
374/// ```
375///
376/// ## Large Deployment (1000 nodes)
377/// ```rust,no_run
378/// use elara_runtime::health::HealthCheckConfig;
379/// use std::time::Duration;
380///
381/// let config = HealthCheckConfig {
382/// enabled: true,
383/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
384/// cache_ttl: Duration::from_secs(30),
385/// min_connections: Some(10),
386/// max_memory_mb: Some(4000),
387/// max_time_drift_ms: Some(100),
388/// max_pending_events: Some(2000),
389/// };
390/// ```
#[derive(Debug, Clone)]
pub struct HealthCheckConfig {
    /// Master switch for the health check system.
    ///
    /// When `false`, no health checks are performed and no HTTP server is
    /// started, regardless of the other fields.
    ///
    /// Default: `true`
    pub enabled: bool,

    /// Optional bind address for the health check HTTP server.
    ///
    /// `Some(addr)` starts an HTTP server on that address exposing the
    /// `/health`, `/ready`, and `/live` endpoints. `None` keeps checks
    /// available programmatically without exposing any endpoints.
    ///
    /// Format: `"host:port"` (e.g., `"0.0.0.0:8080"`, `"127.0.0.1:8080"`)
    ///
    /// Default: `Some("0.0.0.0:8080")`
    pub server_bind_address: Option<std::net::SocketAddr>,

    /// Cache TTL for health check results.
    ///
    /// Results are cached for this duration so repeated probes within the
    /// TTL do not re-run every check.
    ///
    /// Recommended: 10-15 s (high-frequency probing), 30 s (normal),
    /// 60 s (low-frequency).
    ///
    /// Default: 30 seconds
    pub cache_ttl: Duration,

    /// Minimum number of active connections for `ConnectionHealthCheck`.
    ///
    /// `Some(n)` registers a check that reports `Degraded` when the active
    /// connection count falls below `n`. `None` disables the check.
    ///
    /// Recommended: 2-3 (small), 5-10 (medium), 10-20 (large deployments).
    ///
    /// Default: `Some(3)`
    pub min_connections: Option<usize>,

    /// Maximum process memory usage in megabytes for `MemoryHealthCheck`.
    ///
    /// `Some(n)` registers a check that reports `Unhealthy` when memory
    /// usage exceeds `n` MB. `None` disables the check.
    ///
    /// Set this to 80-90% of your container memory limit to allow for
    /// graceful degradation before OOM kills.
    ///
    /// Recommended: 1000 MB (small), 2000 MB (medium), 4000 MB (large
    /// deployments).
    ///
    /// Default: `Some(1800)` (1.8 GB)
    pub max_memory_mb: Option<usize>,

    /// Maximum time drift in milliseconds for `TimeDriftCheck`.
    ///
    /// `Some(n)` registers a check that reports `Degraded` when drift
    /// between local and network consensus time exceeds `n` ms. `None`
    /// disables the check. Excessive drift can cause synchronization
    /// issues and state divergence in distributed systems.
    ///
    /// Recommended value: 100 ms
    ///
    /// Default: `Some(100)`
    pub max_time_drift_ms: Option<i64>,

    /// Maximum pending events for `StateDivergenceCheck`.
    ///
    /// `Some(n)` registers a check that reports `Degraded` when the state
    /// reconciliation engine's pending-event backlog exceeds `n`. `None`
    /// disables the check. A large backlog may indicate network partitions
    /// or reconciliation issues.
    ///
    /// Recommended: 500 (small), 1000 (medium), 2000 (large deployments).
    ///
    /// Default: `Some(1000)`
    pub max_pending_events: Option<usize>,
}
498
499impl Default for HealthCheckConfig {
500 fn default() -> Self {
501 Self {
502 enabled: true,
503 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
504 cache_ttl: Duration::from_secs(30),
505 min_connections: Some(3),
506 max_memory_mb: Some(1800),
507 max_time_drift_ms: Some(100),
508 max_pending_events: Some(1000),
509 }
510 }
511}
512
513impl HealthCheckConfig {
514 /// Creates a new HealthCheckConfig with all checks disabled.
515 ///
516 /// This is useful when you want to selectively enable only specific checks.
517 ///
518 /// # Example
519 ///
520 /// ```rust
521 /// use elara_runtime::health::HealthCheckConfig;
522 ///
523 /// let mut config = HealthCheckConfig::disabled();
524 /// config.enabled = true;
525 /// config.max_memory_mb = Some(2000); // Only enable memory check
526 /// ```
527 pub fn disabled() -> Self {
528 Self {
529 enabled: false,
530 server_bind_address: None,
531 cache_ttl: Duration::from_secs(30),
532 min_connections: None,
533 max_memory_mb: None,
534 max_time_drift_ms: None,
535 max_pending_events: None,
536 }
537 }
538
539 /// Creates a configuration for small deployments (10 nodes).
540 ///
541 /// Recommended thresholds:
542 /// - Min connections: 2
543 /// - Max memory: 1000 MB
544 /// - Max time drift: 100 ms
545 /// - Max pending events: 500
546 pub fn small_deployment() -> Self {
547 Self {
548 enabled: true,
549 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
550 cache_ttl: Duration::from_secs(30),
551 min_connections: Some(2),
552 max_memory_mb: Some(1000),
553 max_time_drift_ms: Some(100),
554 max_pending_events: Some(500),
555 }
556 }
557
558 /// Creates a configuration for medium deployments (100 nodes).
559 ///
560 /// Recommended thresholds:
561 /// - Min connections: 5
562 /// - Max memory: 2000 MB
563 /// - Max time drift: 100 ms
564 /// - Max pending events: 1000
565 pub fn medium_deployment() -> Self {
566 Self {
567 enabled: true,
568 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
569 cache_ttl: Duration::from_secs(30),
570 min_connections: Some(5),
571 max_memory_mb: Some(2000),
572 max_time_drift_ms: Some(100),
573 max_pending_events: Some(1000),
574 }
575 }
576
577 /// Creates a configuration for large deployments (1000 nodes).
578 ///
579 /// Recommended thresholds:
580 /// - Min connections: 10
581 /// - Max memory: 4000 MB
582 /// - Max time drift: 100 ms
583 /// - Max pending events: 2000
584 pub fn large_deployment() -> Self {
585 Self {
586 enabled: true,
587 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
588 cache_ttl: Duration::from_secs(30),
589 min_connections: Some(10),
590 max_memory_mb: Some(4000),
591 max_time_drift_ms: Some(100),
592 max_pending_events: Some(2000),
593 }
594 }
595
596 /// Validates the configuration.
597 ///
598 /// Returns `Ok(())` if the configuration is valid, or an error message
599 /// describing the validation failure.
600 ///
601 /// # Validation Rules
602 ///
603 /// - `cache_ttl` must be at least 1 second
604 /// - If `min_connections` is set, it must be > 0
605 /// - If `max_memory_mb` is set, it must be > 0
606 /// - If `max_time_drift_ms` is set, it must be > 0
607 /// - If `max_pending_events` is set, it must be > 0
608 pub fn validate(&self) -> Result<(), String> {
609 if self.cache_ttl < Duration::from_secs(1) {
610 return Err("cache_ttl must be at least 1 second".to_string());
611 }
612
613 if let Some(min_conn) = self.min_connections {
614 if min_conn == 0 {
615 return Err("min_connections must be greater than 0".to_string());
616 }
617 }
618
619 if let Some(max_mem) = self.max_memory_mb {
620 if max_mem == 0 {
621 return Err("max_memory_mb must be greater than 0".to_string());
622 }
623 }
624
625 if let Some(max_drift) = self.max_time_drift_ms {
626 if max_drift <= 0 {
627 return Err("max_time_drift_ms must be greater than 0".to_string());
628 }
629 }
630
631 if let Some(max_events) = self.max_pending_events {
632 if max_events == 0 {
633 return Err("max_pending_events must be greater than 0".to_string());
634 }
635 }
636
637 Ok(())
638 }
639}
640
641/// Health checker that orchestrates multiple health checks with caching.
642///
643/// The `HealthChecker` runs registered health checks and caches the results
644/// for a configurable TTL to avoid excessive checking overhead. This is
645/// particularly important when health checks are expensive (e.g., database
646/// queries, external service calls).
647///
648/// # Thread Safety
649///
650/// The `HealthChecker` is thread-safe and can be shared across threads using
651/// `Arc`. The internal cache is protected by a `RwLock` for concurrent access.
652///
653/// # Caching Behavior
654///
655/// - Health check results are cached for `cache_ttl` duration
656/// - Cached results are returned if still valid (not expired)
657/// - Expired results trigger a new health check execution
658/// - Cache updates are atomic and thread-safe
659///
660/// # Example
661///
662/// ```rust,no_run
663/// use elara_runtime::health::{HealthChecker, HealthCheck, HealthCheckResult};
664/// use std::time::Duration;
665/// use std::sync::Arc;
666///
667/// struct MyCheck;
668/// impl HealthCheck for MyCheck {
669/// fn name(&self) -> &str { "my_check" }
670/// fn check(&self) -> HealthCheckResult { HealthCheckResult::Healthy }
671/// }
672///
673/// let mut checker = HealthChecker::new(Duration::from_secs(30));
674/// checker.add_check(Box::new(MyCheck));
675///
676/// // First call executes checks
677/// let status1 = checker.check_health();
678///
679/// // Second call within TTL returns cached result
680/// let status2 = checker.check_health();
681/// ```
682pub struct HealthChecker {
683 /// Registered health checks
684 checks: Vec<Box<dyn HealthCheck>>,
685
686 /// Cached health status with timestamp
687 cache: Arc<RwLock<Option<HealthStatus>>>,
688
689 /// Cache time-to-live
690 cache_ttl: Duration,
691}
692
693impl HealthChecker {
694 /// Creates a new HealthChecker with the specified cache TTL.
695 ///
696 /// # Arguments
697 ///
698 /// * `cache_ttl` - Duration for which health check results are cached
699 ///
700 /// # Example
701 ///
702 /// ```rust
703 /// use elara_runtime::health::HealthChecker;
704 /// use std::time::Duration;
705 ///
706 /// let checker = HealthChecker::new(Duration::from_secs(30));
707 /// ```
708 pub fn new(cache_ttl: Duration) -> Self {
709 Self {
710 checks: Vec::new(),
711 cache: Arc::new(RwLock::new(None)),
712 cache_ttl,
713 }
714 }
715
716 /// Creates a new HealthChecker with default configuration.
717 ///
718 /// Uses a default cache TTL of 30 seconds.
719 pub fn with_default_config() -> Self {
720 Self::new(HealthCheckerConfig::default().cache_ttl)
721 }
722
723 /// Creates a new HealthChecker with the specified configuration.
724 pub fn with_config(config: HealthCheckerConfig) -> Self {
725 Self::new(config.cache_ttl)
726 }
727
728 /// Adds a health check to the checker.
729 ///
730 /// Health checks are executed in the order they are added.
731 ///
732 /// # Arguments
733 ///
734 /// * `check` - Boxed health check implementation
735 ///
736 /// # Example
737 ///
738 /// ```rust,no_run
739 /// use elara_runtime::health::{HealthChecker, HealthCheck, HealthCheckResult};
740 /// use std::time::Duration;
741 ///
742 /// struct MyCheck;
743 /// impl HealthCheck for MyCheck {
744 /// fn name(&self) -> &str { "my_check" }
745 /// fn check(&self) -> HealthCheckResult { HealthCheckResult::Healthy }
746 /// }
747 ///
748 /// let mut checker = HealthChecker::new(Duration::from_secs(30));
749 /// checker.add_check(Box::new(MyCheck));
750 /// ```
751 pub fn add_check(&mut self, check: Box<dyn HealthCheck>) {
752 self.checks.push(check);
753 }
754
755 /// Checks the health of all registered checks.
756 ///
757 /// This method returns cached results if they are still valid (within TTL).
758 /// If the cache is expired or empty, it executes all health checks and
759 /// updates the cache.
760 ///
761 /// # Returns
762 ///
763 /// `HealthStatus` containing the overall status and individual check results.
764 ///
765 /// # Performance
766 ///
767 /// - Cached reads: O(1) with read lock
768 /// - Cache miss: O(n) where n is the number of checks, with write lock
769 ///
770 /// # Example
771 ///
772 /// ```rust,no_run
773 /// use elara_runtime::health::HealthChecker;
774 /// use std::time::Duration;
775 ///
776 /// let checker = HealthChecker::new(Duration::from_secs(30));
777 /// let status = checker.check_health();
778 ///
779 /// if status.is_healthy() {
780 /// println!("All systems operational");
781 /// } else {
782 /// println!("System degraded or unhealthy");
783 /// }
784 /// ```
785 pub fn check_health(&self) -> HealthStatus {
786 // Fast path: check if cache is valid
787 {
788 let cache = self.cache.read();
789 if let Some(ref status) = *cache {
790 if status.timestamp.elapsed() < self.cache_ttl {
791 return status.clone();
792 }
793 }
794 }
795
796 // Slow path: execute health checks and update cache
797 let mut results = HashMap::new();
798
799 for check in &self.checks {
800 let result = check.check();
801 results.insert(check.name().to_string(), result);
802 }
803
804 let status = HealthStatus::new(results);
805
806 // Update cache
807 {
808 let mut cache = self.cache.write();
809 *cache = Some(status.clone());
810 }
811
812 status
813 }
814
815 /// Clears the cached health status, forcing the next check to execute.
816 ///
817 /// This is useful for testing or when you need to force a fresh health check.
818 pub fn clear_cache(&self) {
819 let mut cache = self.cache.write();
820 *cache = None;
821 }
822
823 /// Returns the number of registered health checks.
824 pub fn check_count(&self) -> usize {
825 self.checks.len()
826 }
827
828 /// Returns the cache TTL duration.
829 pub fn cache_ttl(&self) -> Duration {
830 self.cache_ttl
831 }
832}
833
#[cfg(test)]
mod tests {
    use super::*;

    /// Test double whose name and result are fixed at construction time.
    struct FixedCheck {
        label: &'static str,
        outcome: HealthCheckResult,
    }

    impl FixedCheck {
        /// A check that always reports `Healthy`.
        fn healthy() -> Box<dyn HealthCheck> {
            Box::new(FixedCheck {
                label: "always_healthy",
                outcome: HealthCheckResult::Healthy,
            })
        }

        /// A check that always reports `Degraded`.
        fn degraded() -> Box<dyn HealthCheck> {
            Box::new(FixedCheck {
                label: "always_degraded",
                outcome: HealthCheckResult::Degraded {
                    reason: "Test degradation".to_string(),
                },
            })
        }

        /// A check that always reports `Unhealthy`.
        fn unhealthy() -> Box<dyn HealthCheck> {
            Box::new(FixedCheck {
                label: "always_unhealthy",
                outcome: HealthCheckResult::Unhealthy {
                    reason: "Test failure".to_string(),
                },
            })
        }
    }

    impl HealthCheck for FixedCheck {
        fn name(&self) -> &str {
            self.label
        }

        fn check(&self) -> HealthCheckResult {
            self.outcome.clone()
        }
    }

    #[test]
    fn test_health_check_result_methods() {
        // Healthy carries no reason.
        let healthy = HealthCheckResult::Healthy;
        assert!(healthy.is_healthy());
        assert!(!healthy.is_degraded());
        assert!(!healthy.is_unhealthy());
        assert_eq!(healthy.reason(), None);

        // Degraded exposes its reason.
        let degraded = HealthCheckResult::Degraded {
            reason: "test".to_string(),
        };
        assert!(!degraded.is_healthy());
        assert!(degraded.is_degraded());
        assert!(!degraded.is_unhealthy());
        assert_eq!(degraded.reason(), Some("test"));

        // Unhealthy exposes its reason.
        let unhealthy = HealthCheckResult::Unhealthy {
            reason: "test".to_string(),
        };
        assert!(!unhealthy.is_healthy());
        assert!(!unhealthy.is_degraded());
        assert!(unhealthy.is_unhealthy());
        assert_eq!(unhealthy.reason(), Some("test"));
    }

    #[test]
    fn test_health_checker_empty() {
        // Zero registered checks is vacuously healthy.
        let checker = HealthChecker::new(Duration::from_secs(30));
        let status = checker.check_health();
        assert!(status.is_healthy());
        assert_eq!(status.checks.len(), 0);
    }

    #[test]
    fn test_health_checker_all_healthy() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());

        let status = checker.check_health();
        assert!(status.is_healthy());
        assert_eq!(status.checks.len(), 1);
    }

    #[test]
    fn test_health_checker_degraded() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());
        checker.add_check(FixedCheck::degraded());

        let status = checker.check_health();
        assert!(status.is_degraded());
        assert_eq!(status.checks.len(), 2);
    }

    #[test]
    fn test_health_checker_unhealthy() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());
        checker.add_check(FixedCheck::degraded());
        checker.add_check(FixedCheck::unhealthy());

        let status = checker.check_health();
        assert!(status.is_unhealthy());
        assert_eq!(status.checks.len(), 3);
    }

    #[test]
    fn test_health_checker_precedence() {
        // Unhealthy outranks Degraded.
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::degraded());
        checker.add_check(FixedCheck::unhealthy());
        assert!(checker.check_health().is_unhealthy());

        // Degraded outranks Healthy.
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());
        checker.add_check(FixedCheck::degraded());
        assert!(checker.check_health().is_degraded());
    }

    #[test]
    fn test_health_checker_caching() {
        let mut checker = HealthChecker::new(Duration::from_millis(100));
        checker.add_check(FixedCheck::healthy());

        // Within the TTL the cached snapshot (same timestamp) is reused.
        let first = checker.check_health().timestamp;
        let second = checker.check_health().timestamp;
        assert_eq!(first, second);

        // Once the TTL has elapsed, the checks run again and the
        // timestamp advances.
        std::thread::sleep(Duration::from_millis(150));
        let third = checker.check_health().timestamp;
        assert!(third > first);
    }

    #[test]
    fn test_health_checker_clear_cache() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());

        let before = checker.check_health().timestamp;
        checker.clear_cache();
        // With the cache cleared, the next call must recompute.
        let after = checker.check_health().timestamp;
        assert!(after > before);
    }

    #[test]
    fn test_health_status_aggregation() {
        let mut checks = HashMap::new();

        // All healthy => overall healthy.
        checks.insert("check1".to_string(), HealthCheckResult::Healthy);
        checks.insert("check2".to_string(), HealthCheckResult::Healthy);
        assert!(HealthStatus::new(checks.clone()).is_healthy());

        // A single degraded check degrades the aggregate.
        checks.insert(
            "check3".to_string(),
            HealthCheckResult::Degraded {
                reason: "test".to_string(),
            },
        );
        assert!(HealthStatus::new(checks.clone()).is_degraded());

        // A single unhealthy check dominates everything else.
        checks.insert(
            "check4".to_string(),
            HealthCheckResult::Unhealthy {
                reason: "test".to_string(),
            },
        );
        assert!(HealthStatus::new(checks).is_unhealthy());
    }
}
1023
1024// ============================================================================
1025// Built-in Health Checks
1026// ============================================================================
1027
/// Health check for monitoring active connection count.
///
/// Verifies that the node holds at least `min_connections` active network
/// connections. Too few connections may indicate network issues,
/// configuration problems, or that the node is isolated from the network.
///
/// # Status Determination
///
/// - `Healthy`: Active connections >= min_connections
/// - `Degraded`: Active connections < min_connections
///
/// This check never reports `Unhealthy`: a low connection count is treated
/// as a degradation rather than a hard failure.
///
/// # Example
///
/// ```rust,no_run
/// use elara_runtime::health::{ConnectionHealthCheck, HealthCheck};
/// use elara_runtime::node::Node;
/// use std::sync::Arc;
///
/// let node = Arc::new(Node::new());
/// let check = ConnectionHealthCheck::new(node, 3);
/// let result = check.check();
/// ```
pub struct ConnectionHealthCheck {
    /// Node whose connections are monitored. Currently unused (hence the
    /// leading underscore): `check()` relies on a placeholder count until
    /// `Node` exposes an active-connection query.
    _node: Arc<crate::node::Node>,
    /// Minimum number of connections required for healthy status
    min_connections: usize,
}
1056
1057impl ConnectionHealthCheck {
1058 /// Creates a new ConnectionHealthCheck.
1059 ///
1060 /// # Arguments
1061 ///
1062 /// * `node` - Arc reference to the Node to monitor
1063 /// * `min_connections` - Minimum number of active connections for healthy status
1064 ///
1065 /// # Example
1066 ///
1067 /// ```rust,no_run
1068 /// use elara_runtime::health::ConnectionHealthCheck;
1069 /// use elara_runtime::node::Node;
1070 /// use std::sync::Arc;
1071 ///
1072 /// let node = Arc::new(Node::new());
1073 /// let check = ConnectionHealthCheck::new(node, 3);
1074 /// ```
1075 pub fn new(node: Arc<crate::node::Node>, min_connections: usize) -> Self {
1076 Self {
1077 _node: node,
1078 min_connections,
1079 }
1080 }
1081
1082 /// Returns the configured minimum connections threshold.
1083 pub fn min_connections(&self) -> usize {
1084 self.min_connections
1085 }
1086}
1087
1088impl HealthCheck for ConnectionHealthCheck {
1089 fn name(&self) -> &str {
1090 "connections"
1091 }
1092
1093 fn check(&self) -> HealthCheckResult {
1094 // For now, we'll use a placeholder since Node doesn't have active_connections() yet
1095 // In a real implementation, this would query the actual connection count
1096 // from the transport layer or session manager
1097 let active = 0; // TODO: Implement node.active_connections()
1098
1099 if active >= self.min_connections {
1100 HealthCheckResult::Healthy
1101 } else {
1102 HealthCheckResult::Degraded {
1103 reason: format!(
1104 "Only {} active connections (minimum: {})",
1105 active, self.min_connections
1106 ),
1107 }
1108 }
1109 }
1110}
1111
/// Health check for monitoring memory usage.
///
/// Compares the current process memory usage (sampled via the `sysinfo`
/// crate) against a configured maximum. Excessive memory usage can lead to
/// OOM kills, performance degradation, and system instability.
///
/// # Status Determination
///
/// - `Healthy`: Memory usage < max_memory_mb
/// - `Unhealthy`: Memory usage >= max_memory_mb
///
/// Note that, unlike most checks in this module, crossing the threshold
/// reports `Unhealthy` rather than `Degraded`.
///
/// # Example
///
/// ```rust
/// use elara_runtime::health::{MemoryHealthCheck, HealthCheck};
///
/// let check = MemoryHealthCheck::new(1800); // 1800 MB limit
/// let result = check.check();
/// ```
pub struct MemoryHealthCheck {
    /// Maximum memory usage in megabytes before unhealthy
    max_memory_mb: usize,
    /// Shared `sysinfo::System` handle, reused across checks so each call
    /// only refreshes (rather than rebuilds) the system snapshot.
    system: Arc<RwLock<sysinfo::System>>,
}
1139
1140impl MemoryHealthCheck {
1141 /// Creates a new MemoryHealthCheck.
1142 ///
1143 /// # Arguments
1144 ///
1145 /// * `max_memory_mb` - Maximum memory usage in MB before marking unhealthy
1146 ///
1147 /// # Example
1148 ///
1149 /// ```rust
1150 /// use elara_runtime::health::MemoryHealthCheck;
1151 ///
1152 /// let check = MemoryHealthCheck::new(2048); // 2GB limit
1153 /// ```
1154 pub fn new(max_memory_mb: usize) -> Self {
1155 Self {
1156 max_memory_mb,
1157 system: Arc::new(RwLock::new(sysinfo::System::new_all())),
1158 }
1159 }
1160
1161 /// Returns the configured maximum memory threshold in MB.
1162 pub fn max_memory_mb(&self) -> usize {
1163 self.max_memory_mb
1164 }
1165
1166 /// Gets the current memory usage in megabytes.
1167 ///
1168 /// This method refreshes the system memory information and returns
1169 /// the current process memory usage.
1170 fn get_memory_usage_mb(&self) -> usize {
1171 let mut system = self.system.write();
1172 system.refresh_memory();
1173 system.refresh_processes();
1174
1175 // Get current process PID
1176 let pid = sysinfo::get_current_pid().ok();
1177
1178 if let Some(pid) = pid {
1179 if let Some(process) = system.process(pid) {
1180 // Convert bytes to megabytes
1181 return (process.memory() / 1_048_576) as usize;
1182 }
1183 }
1184
1185 // Fallback: return 0 if we can't get process info
1186 0
1187 }
1188}
1189
1190impl HealthCheck for MemoryHealthCheck {
1191 fn name(&self) -> &str {
1192 "memory"
1193 }
1194
1195 fn check(&self) -> HealthCheckResult {
1196 let usage_mb = self.get_memory_usage_mb();
1197
1198 if usage_mb < self.max_memory_mb {
1199 HealthCheckResult::Healthy
1200 } else {
1201 HealthCheckResult::Unhealthy {
1202 reason: format!(
1203 "Memory usage {}MB exceeds limit {}MB",
1204 usage_mb, self.max_memory_mb
1205 ),
1206 }
1207 }
1208 }
1209}
1210
/// Health check for monitoring time drift.
///
/// Monitors the drift between the local node clock and the network
/// consensus time, as reported by the node's time engine. Excessive drift
/// can cause synchronization issues, event ordering problems, and state
/// divergence.
///
/// # Status Determination
///
/// - `Healthy`: |time_drift| < max_drift_ms
/// - `Degraded`: |time_drift| >= max_drift_ms
///
/// # Example
///
/// ```rust,no_run
/// use elara_runtime::health::{TimeDriftCheck, HealthCheck};
/// use elara_runtime::node::Node;
/// use std::sync::Arc;
///
/// let node = Arc::new(Node::new());
/// let check = TimeDriftCheck::new(node, 100); // 100ms max drift
/// let result = check.check();
/// ```
pub struct TimeDriftCheck {
    /// Node whose time engine supplies the drift measurement
    node: Arc<crate::node::Node>,
    /// Maximum acceptable absolute time drift in milliseconds
    max_drift_ms: i64,
}
1239
1240impl TimeDriftCheck {
1241 /// Creates a new TimeDriftCheck.
1242 ///
1243 /// # Arguments
1244 ///
1245 /// * `node` - Arc reference to the Node to monitor
1246 /// * `max_drift_ms` - Maximum acceptable time drift in milliseconds
1247 ///
1248 /// # Example
1249 ///
1250 /// ```rust,no_run
1251 /// use elara_runtime::health::TimeDriftCheck;
1252 /// use elara_runtime::node::Node;
1253 /// use std::sync::Arc;
1254 ///
1255 /// let node = Arc::new(Node::new());
1256 /// let check = TimeDriftCheck::new(node, 100);
1257 /// ```
1258 pub fn new(node: Arc<crate::node::Node>, max_drift_ms: i64) -> Self {
1259 Self {
1260 node,
1261 max_drift_ms,
1262 }
1263 }
1264
1265 /// Returns the configured maximum drift threshold in milliseconds.
1266 pub fn max_drift_ms(&self) -> i64 {
1267 self.max_drift_ms
1268 }
1269
1270 /// Gets the current time drift in milliseconds.
1271 ///
1272 /// This queries the time engine to determine the drift between
1273 /// local time and network consensus time.
1274 fn get_time_drift_ms(&self) -> i64 {
1275 // Access the time engine to get drift information
1276 let time_engine = self.node.time_engine();
1277
1278 // Get the current drift from the time engine
1279 // The drift is the difference between local time and network time
1280 time_engine.drift_ms()
1281 }
1282}
1283
1284impl HealthCheck for TimeDriftCheck {
1285 fn name(&self) -> &str {
1286 "time_drift"
1287 }
1288
1289 fn check(&self) -> HealthCheckResult {
1290 let drift_ms = self.get_time_drift_ms();
1291 let abs_drift = drift_ms.abs();
1292
1293 if abs_drift < self.max_drift_ms {
1294 HealthCheckResult::Healthy
1295 } else {
1296 HealthCheckResult::Degraded {
1297 reason: format!(
1298 "Time drift {}ms exceeds limit {}ms",
1299 drift_ms, self.max_drift_ms
1300 ),
1301 }
1302 }
1303 }
1304}
1305
/// Health check for monitoring state convergence.
///
/// Monitors the state engine's backlog of pending (not yet reconciled)
/// events. A growing backlog can indicate network partitions, bugs in the
/// reconciliation logic, or other serious issues.
///
/// # Status Determination
///
/// - `Healthy`: Pending events < max_pending_events
/// - `Degraded`: Pending events >= max_pending_events
///
/// NOTE(review): earlier docs also listed an `Unhealthy` state for outright
/// divergence, but the current implementation only distinguishes
/// Healthy/Degraded based on the pending-event count.
///
/// # Example
///
/// ```rust,no_run
/// use elara_runtime::health::{StateDivergenceCheck, HealthCheck};
/// use elara_runtime::node::Node;
/// use std::sync::Arc;
///
/// let node = Arc::new(Node::new());
/// let check = StateDivergenceCheck::new(node);
/// let result = check.check();
/// ```
pub struct StateDivergenceCheck {
    /// Node whose state engine is queried for the pending-event count
    node: Arc<crate::node::Node>,
    /// Maximum acceptable pending events before degraded
    max_pending_events: usize,
}
1336
1337impl StateDivergenceCheck {
1338 /// Creates a new StateDivergenceCheck.
1339 ///
1340 /// # Arguments
1341 ///
1342 /// * `node` - Arc reference to the Node to monitor
1343 ///
1344 /// # Example
1345 ///
1346 /// ```rust,no_run
1347 /// use elara_runtime::health::StateDivergenceCheck;
1348 /// use elara_runtime::node::Node;
1349 /// use std::sync::Arc;
1350 ///
1351 /// let node = Arc::new(Node::new());
1352 /// let check = StateDivergenceCheck::new(node);
1353 /// ```
1354 pub fn new(node: Arc<crate::node::Node>) -> Self {
1355 Self::with_threshold(node, 1000)
1356 }
1357
1358 /// Creates a new StateDivergenceCheck with a custom threshold.
1359 ///
1360 /// # Arguments
1361 ///
1362 /// * `node` - Arc reference to the Node to monitor
1363 /// * `max_pending_events` - Maximum pending events before degraded status
1364 ///
1365 /// # Example
1366 ///
1367 /// ```rust,no_run
1368 /// use elara_runtime::health::StateDivergenceCheck;
1369 /// use elara_runtime::node::Node;
1370 /// use std::sync::Arc;
1371 ///
1372 /// let node = Arc::new(Node::new());
1373 /// let check = StateDivergenceCheck::with_threshold(node, 500);
1374 /// ```
1375 pub fn with_threshold(node: Arc<crate::node::Node>, max_pending_events: usize) -> Self {
1376 Self {
1377 node,
1378 max_pending_events,
1379 }
1380 }
1381
1382 /// Returns the configured maximum pending events threshold.
1383 pub fn max_pending_events(&self) -> usize {
1384 self.max_pending_events
1385 }
1386
1387 /// Checks the state convergence status.
1388 ///
1389 /// This examines the reconciliation engine to determine if state
1390 /// is converging properly.
1391 fn check_convergence(&self) -> (bool, usize) {
1392 // Access the state engine to check convergence
1393 let state_engine = self.node.state_engine();
1394
1395 // Get the number of pending events that haven't been reconciled
1396 // In a real implementation, this would query the reconciliation engine
1397 // for metrics about pending events, unmerged states, etc.
1398 let pending_events = state_engine.pending_count();
1399
1400 // Check if we're converging (pending count is reasonable)
1401 let is_converging = pending_events < self.max_pending_events;
1402
1403 (is_converging, pending_events)
1404 }
1405}
1406
1407impl HealthCheck for StateDivergenceCheck {
1408 fn name(&self) -> &str {
1409 "state_convergence"
1410 }
1411
1412 fn check(&self) -> HealthCheckResult {
1413 let (is_converging, pending_events) = self.check_convergence();
1414
1415 if is_converging {
1416 HealthCheckResult::Healthy
1417 } else {
1418 HealthCheckResult::Degraded {
1419 reason: format!(
1420 "State convergence slow: {} pending events (threshold: {})",
1421 pending_events, self.max_pending_events
1422 ),
1423 }
1424 }
1425 }
1426}
1427
#[cfg(test)]
mod builtin_tests {
    use super::*;
    use crate::node::Node;

    #[test]
    fn test_memory_health_check() {
        // A huge limit should never be exceeded by the test process.
        let generous = MemoryHealthCheck::new(100_000); // 100GB
        assert!(generous.check().is_healthy(), "Should be healthy with high threshold");

        // A tiny limit is guaranteed to be exceeded.
        let strict = MemoryHealthCheck::new(1); // 1MB
        assert!(strict.check().is_unhealthy(), "Should be unhealthy with low threshold");
    }

    #[test]
    fn test_memory_health_check_threshold() {
        assert_eq!(MemoryHealthCheck::new(1800).max_memory_mb(), 1800);
    }

    #[test]
    fn test_connection_health_check_creation() {
        let check = ConnectionHealthCheck::new(Arc::new(Node::new()), 3);
        assert_eq!(check.name(), "connections");
        assert_eq!(check.min_connections(), 3);
    }

    #[test]
    fn test_time_drift_check_creation() {
        let check = TimeDriftCheck::new(Arc::new(Node::new()), 100);
        assert_eq!(check.name(), "time_drift");
        assert_eq!(check.max_drift_ms(), 100);
    }

    #[test]
    fn test_state_divergence_check_creation() {
        let node = Arc::new(Node::new());

        // `new` applies the default threshold of 1000 pending events.
        let default_check = StateDivergenceCheck::new(node.clone());
        assert_eq!(default_check.name(), "state_convergence");
        assert_eq!(default_check.max_pending_events(), 1000);

        // `with_threshold` honors a caller-supplied limit.
        let custom_check = StateDivergenceCheck::with_threshold(node, 500);
        assert_eq!(custom_check.max_pending_events(), 500);
    }

    #[test]
    fn test_all_builtin_checks_with_health_checker() {
        let node = Arc::new(Node::new());
        let mut checker = HealthChecker::new(Duration::from_secs(30));

        // Register every built-in check.
        checker.add_check(Box::new(ConnectionHealthCheck::new(node.clone(), 3)));
        checker.add_check(Box::new(MemoryHealthCheck::new(100_000)));
        checker.add_check(Box::new(TimeDriftCheck::new(node.clone(), 100)));
        checker.add_check(Box::new(StateDivergenceCheck::new(node)));
        assert_eq!(checker.check_count(), 4);

        // Run the checks and confirm each one reported under its own name.
        let status = checker.check_health();
        assert_eq!(status.checks.len(), 4);
        for name in &["connections", "memory", "time_drift", "state_convergence"] {
            assert!(status.checks.contains_key(*name));
        }
    }

    #[test]
    fn test_health_check_config_default() {
        let cfg = HealthCheckConfig::default();

        assert!(cfg.enabled);
        assert!(cfg.server_bind_address.is_some());
        assert_eq!(cfg.cache_ttl, Duration::from_secs(30));
        assert_eq!(cfg.min_connections, Some(3));
        assert_eq!(cfg.max_memory_mb, Some(1800));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(1000));

        // The defaults must pass their own validation.
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_disabled() {
        let cfg = HealthCheckConfig::disabled();

        assert!(!cfg.enabled);
        assert!(cfg.server_bind_address.is_none());
        assert!(cfg.min_connections.is_none());
        assert!(cfg.max_memory_mb.is_none());
        assert!(cfg.max_time_drift_ms.is_none());
        assert!(cfg.max_pending_events.is_none());

        // A disabled config is still structurally valid.
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_small_deployment() {
        let cfg = HealthCheckConfig::small_deployment();

        assert!(cfg.enabled);
        assert_eq!(cfg.min_connections, Some(2));
        assert_eq!(cfg.max_memory_mb, Some(1000));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(500));
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_medium_deployment() {
        let cfg = HealthCheckConfig::medium_deployment();

        assert!(cfg.enabled);
        assert_eq!(cfg.min_connections, Some(5));
        assert_eq!(cfg.max_memory_mb, Some(2000));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(1000));
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_large_deployment() {
        let cfg = HealthCheckConfig::large_deployment();

        assert!(cfg.enabled);
        assert_eq!(cfg.min_connections, Some(10));
        assert_eq!(cfg.max_memory_mb, Some(4000));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(2000));
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_cache_ttl() {
        let mut cfg = HealthCheckConfig::default();

        // One second is the smallest accepted TTL.
        cfg.cache_ttl = Duration::from_secs(1);
        assert!(cfg.validate().is_ok());

        // Anything below one second is rejected with a specific message.
        cfg.cache_ttl = Duration::from_millis(999);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "cache_ttl must be at least 1 second"
        );
    }

    #[test]
    fn test_health_check_config_validation_min_connections() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.min_connections = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.min_connections = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "min_connections must be greater than 0"
        );

        // `None` disables the check and is always valid.
        cfg.min_connections = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_max_memory() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.max_memory_mb = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.max_memory_mb = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "max_memory_mb must be greater than 0"
        );

        // `None` disables the check and is always valid.
        cfg.max_memory_mb = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_max_time_drift() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.max_time_drift_ms = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.max_time_drift_ms = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "max_time_drift_ms must be greater than 0"
        );

        // Negative values are rejected too.
        cfg.max_time_drift_ms = Some(-1);
        assert!(cfg.validate().is_err());

        // `None` disables the check and is always valid.
        cfg.max_time_drift_ms = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_max_pending_events() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.max_pending_events = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.max_pending_events = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "max_pending_events must be greater than 0"
        );

        // `None` disables the check and is always valid.
        cfg.max_pending_events = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_selective_checks() {
        // Individual checks can be disabled by leaving their threshold unset.
        let cfg = HealthCheckConfig {
            enabled: true,
            server_bind_address: None,
            cache_ttl: Duration::from_secs(30),
            min_connections: Some(5),
            max_memory_mb: None, // Disabled
            max_time_drift_ms: Some(100),
            max_pending_events: None, // Disabled
        };

        assert!(cfg.validate().is_ok());
        assert!(cfg.enabled);
        assert!(cfg.min_connections.is_some());
        assert!(cfg.max_memory_mb.is_none());
        assert!(cfg.max_time_drift_ms.is_some());
        assert!(cfg.max_pending_events.is_none());
    }
}