// elara_runtime/health.rs
1//! Health Check System for ELARA Runtime
2//!
3//! This module provides a production-grade health checking system with:
4//! - Pluggable health checks via the `HealthCheck` trait
5//! - Result caching with configurable TTL to avoid excessive checking
6//! - Proper aggregation logic (Unhealthy > Degraded > Healthy)
7//! - Thread-safe operation with Arc<RwLock<>>
8//! - Support for Kubernetes liveness and readiness probes
9//! - Four built-in health checks for common monitoring needs
10//!
11//! # Architecture
12//!
13//! The health check system consists of:
14//! - `HealthCheck` trait: Defines the interface for individual health checks
15//! - `HealthChecker`: Orchestrates multiple health checks and caches results
16//! - `HealthStatus`: Aggregated health status with individual check results
17//! - `HealthCheckResult`: Result of an individual health check
18//!
19//! # Built-in Health Checks
20//!
21//! The module provides four production-ready health checks:
22//!
23//! ## 1. ConnectionHealthCheck
24//!
25//! Monitors the number of active connections to ensure the node is properly
26//! connected to the network. Returns `Degraded` if the connection count falls
27//! below the configured minimum.
28//!
29//! **Use case:** Detect network isolation or connectivity issues
30//!
31//! ## 2. MemoryHealthCheck
32//!
33//! Monitors process memory usage using the `sysinfo` crate to obtain real
34//! system metrics. Returns `Unhealthy` if memory usage exceeds the configured
35//! maximum, which helps prevent OOM kills and performance degradation.
36//!
37//! **Use case:** Detect memory leaks or excessive memory consumption
38//!
39//! ## 3. TimeDriftCheck
40//!
41//! Monitors time drift between the local node and network consensus time.
42//! Returns `Degraded` if drift exceeds the configured threshold. Excessive
43//! time drift can cause synchronization issues and state divergence.
44//!
45//! **Use case:** Detect clock synchronization issues
46//!
47//! ## 4. StateDivergenceCheck
48//!
49//! Monitors the state reconciliation engine to ensure state is converging
50//! properly. Returns `Degraded` if the number of pending events exceeds
51//! the configured threshold, which may indicate network partitions or
52//! reconciliation issues.
53//!
54//! **Use case:** Detect state convergence problems
55//!
56//! # Example
57//!
58//! ```rust,no_run
59//! use elara_runtime::health::{
60//! HealthChecker, HealthCheck, HealthCheckResult,
61//! ConnectionHealthCheck, MemoryHealthCheck, TimeDriftCheck, StateDivergenceCheck
62//! };
63//! use elara_runtime::node::Node;
64//! use std::sync::Arc;
65//! use std::time::Duration;
66//!
67//! // Create a node
68//! let node = Arc::new(Node::new());
69//!
70//! // Create health checker with 30-second cache
71//! let mut checker = HealthChecker::new(Duration::from_secs(30));
72//!
73//! // Add built-in health checks
74//! checker.add_check(Box::new(ConnectionHealthCheck::new(node.clone(), 3)));
75//! checker.add_check(Box::new(MemoryHealthCheck::new(1800))); // 1800 MB
76//! checker.add_check(Box::new(TimeDriftCheck::new(node.clone(), 100))); // 100ms
77//! checker.add_check(Box::new(StateDivergenceCheck::new(node)));
78//!
79//! // Check health
80//! let status = checker.check_health();
81//!
82//! if status.is_healthy() {
83//! println!("All systems operational");
84//! } else if status.is_degraded() {
85//! println!("System degraded: {:?}", status.overall.reason());
86//! } else {
87//! println!("System unhealthy: {:?}", status.overall.reason());
88//! }
89//! ```
90//!
91//! # Production Deployment
92//!
93//! In production, health checks are typically exposed via HTTP endpoints:
94//!
95//! - `/health` - Overall health status (200 OK if healthy/degraded, 503 if unhealthy)
96//! - `/ready` - Readiness probe for Kubernetes (checks if node can accept traffic)
97//! - `/live` - Liveness probe for Kubernetes (checks if node should be restarted)
98//!
99//! Configure thresholds based on your deployment:
100//!
101//! - **Small deployment (10 nodes)**: min_connections=2, max_memory_mb=1000
102//! - **Medium deployment (100 nodes)**: min_connections=5, max_memory_mb=2000
103//! - **Large deployment (1000 nodes)**: min_connections=10, max_memory_mb=4000
104
105use std::collections::HashMap;
106use std::sync::Arc;
107use std::time::{Duration, Instant};
108use parking_lot::RwLock;
109
/// Outcome of a single health check.
///
/// A check reports one of three states, ordered by increasing severity:
/// `Healthy` < `Degraded` < `Unhealthy`. When many checks are combined,
/// the worst state wins (Unhealthy > Degraded > Healthy).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HealthCheckResult {
    /// The component is functioning normally
    Healthy,

    /// The component works, but at reduced capacity or performance.
    ///
    /// Covers non-critical issues (elevated latency, partial capacity)
    /// that do not prevent operation.
    Degraded {
        /// Human-readable reason for the degraded state
        reason: String,
    },

    /// The component cannot operate normally.
    ///
    /// Signals a critical issue that prevents normal operation.
    Unhealthy {
        /// Human-readable reason for the unhealthy state
        reason: String,
    },
}

impl HealthCheckResult {
    /// Severity rank used for worst-wins aggregation (higher is worse):
    /// Healthy = 0, Degraded = 1, Unhealthy = 2.
    fn severity(&self) -> u8 {
        match self {
            HealthCheckResult::Healthy => 0,
            HealthCheckResult::Degraded { .. } => 1,
            HealthCheckResult::Unhealthy { .. } => 2,
        }
    }

    /// True when this result is `Healthy`.
    pub fn is_healthy(&self) -> bool {
        self.severity() == 0
    }

    /// True when this result is `Degraded`.
    pub fn is_degraded(&self) -> bool {
        self.severity() == 1
    }

    /// True when this result is `Unhealthy`.
    pub fn is_unhealthy(&self) -> bool {
        self.severity() == 2
    }

    /// The explanatory message, if any; `Healthy` carries none.
    pub fn reason(&self) -> Option<&str> {
        match self {
            HealthCheckResult::Healthy => None,
            HealthCheckResult::Degraded { reason }
            | HealthCheckResult::Unhealthy { reason } => Some(reason),
        }
    }
}
176
/// Trait for implementing custom health checks.
///
/// Health checks must be `Send + Sync` so they can be executed
/// concurrently across threads. The `check()` method should be fast
/// (ideally < 10ms) to avoid blocking the health check endpoint.
///
/// # Example
///
/// ```rust,ignore
/// use std::sync::Arc;
/// use elara_runtime::health::{HealthCheck, HealthCheckResult};
///
/// struct DatabaseHealthCheck {
///     connection_pool: Arc<ConnectionPool>,
/// }
///
/// impl HealthCheck for DatabaseHealthCheck {
///     fn name(&self) -> &str {
///         "database"
///     }
///
///     fn check(&self) -> HealthCheckResult {
///         match self.connection_pool.ping() {
///             Ok(_) => HealthCheckResult::Healthy,
///             Err(e) => HealthCheckResult::Unhealthy {
///                 reason: format!("Database ping failed: {}", e),
///             },
///         }
///     }
/// }
/// ```
pub trait HealthCheck: Send + Sync {
    /// Returns the name of this health check.
    ///
    /// The name should be unique within a `HealthChecker` — results are
    /// keyed by name, so a duplicate name overwrites the earlier check's
    /// result in the aggregated status. It should be a valid identifier
    /// (lowercase, alphanumeric, underscores).
    fn name(&self) -> &str;

    /// Performs the health check and returns the result.
    ///
    /// This method should be relatively fast (< 10ms ideally) to avoid
    /// blocking the health check endpoint. For expensive checks, consider
    /// running them in the background and caching the result.
    fn check(&self) -> HealthCheckResult;
}
221
222/// Aggregated health status containing overall status and individual check results.
223///
224/// The overall status is determined by the worst individual check result:
225/// - If any check is Unhealthy, overall is Unhealthy
226/// - Else if any check is Degraded, overall is Degraded
227/// - Else overall is Healthy
228#[derive(Debug, Clone)]
229pub struct HealthStatus {
230 /// Overall health status (worst of all checks)
231 pub overall: HealthCheckResult,
232
233 /// Individual health check results by name
234 pub checks: HashMap<String, HealthCheckResult>,
235
236 /// Timestamp when this status was computed
237 pub timestamp: Instant,
238}
239
240impl HealthStatus {
241 /// Creates a new HealthStatus with the given checks
242 fn new(checks: HashMap<String, HealthCheckResult>) -> Self {
243 let overall = Self::aggregate_results(&checks);
244 Self {
245 overall,
246 checks,
247 timestamp: Instant::now(),
248 }
249 }
250
251 /// Aggregates individual check results into an overall status.
252 ///
253 /// Precedence: Unhealthy > Degraded > Healthy
254 fn aggregate_results(checks: &HashMap<String, HealthCheckResult>) -> HealthCheckResult {
255 if checks.is_empty() {
256 return HealthCheckResult::Healthy;
257 }
258
259 // Find the worst result
260 let worst = checks.values()
261 .max_by_key(|result| result.severity())
262 .unwrap(); // Safe because we checked is_empty
263
264 worst.clone()
265 }
266
267 /// Returns true if the overall status is Healthy
268 pub fn is_healthy(&self) -> bool {
269 self.overall.is_healthy()
270 }
271
272 /// Returns true if the overall status is Degraded
273 pub fn is_degraded(&self) -> bool {
274 self.overall.is_degraded()
275 }
276
277 /// Returns true if the overall status is Unhealthy
278 pub fn is_unhealthy(&self) -> bool {
279 self.overall.is_unhealthy()
280 }
281}
282
/// Configuration options for the health checker.
#[derive(Debug, Clone)]
pub struct HealthCheckerConfig {
    /// How long a computed health status remains valid in the cache
    pub cache_ttl: Duration,
}

impl Default for HealthCheckerConfig {
    /// Defaults to a 30-second cache TTL.
    fn default() -> Self {
        HealthCheckerConfig {
            cache_ttl: Duration::from_secs(30),
        }
    }
}
297
298/// Comprehensive health check configuration for NodeConfig.
299///
300/// This configuration enables and configures the health check system for a node.
301/// When set in `NodeConfig`, the node will automatically initialize health checks
302/// and optionally start an HTTP server to expose health endpoints.
303///
304/// # Health Check System
305///
306/// The health check system provides:
307/// - Built-in checks for connections, memory, time drift, and state convergence
308/// - Configurable thresholds for each check
309/// - HTTP endpoints for Kubernetes probes and load balancers
310/// - Result caching to minimize overhead
311///
312/// # HTTP Endpoints
313///
314/// When `server_bind_address` is set, the following endpoints are exposed:
315/// - `GET /health` - Overall health status (200 OK if healthy/degraded, 503 if unhealthy)
316/// - `GET /ready` - Readiness probe (200 OK if healthy/degraded, 503 if unhealthy)
317/// - `GET /live` - Liveness probe (200 OK if healthy/degraded, 503 if unhealthy)
318///
319/// # Example
320///
321/// ```rust,no_run
322/// use elara_runtime::health::HealthCheckConfig;
323/// use elara_runtime::node::NodeConfig;
324/// use std::time::Duration;
325///
326/// let health_config = HealthCheckConfig {
327/// enabled: true,
328/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
329/// cache_ttl: Duration::from_secs(30),
330/// min_connections: Some(3),
331/// max_memory_mb: Some(1800),
332/// max_time_drift_ms: Some(100),
333/// max_pending_events: Some(1000),
334/// };
335///
336/// let node_config = NodeConfig {
337/// health_checks: Some(health_config),
338/// ..Default::default()
339/// };
340/// ```
341///
342/// # Production Recommendations
343///
344/// ## Small Deployment (10 nodes)
345/// ```rust,no_run
346/// use elara_runtime::health::HealthCheckConfig;
347/// use std::time::Duration;
348///
349/// let config = HealthCheckConfig {
350/// enabled: true,
351/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
352/// cache_ttl: Duration::from_secs(30),
353/// min_connections: Some(2),
354/// max_memory_mb: Some(1000),
355/// max_time_drift_ms: Some(100),
356/// max_pending_events: Some(500),
357/// };
358/// ```
359///
360/// ## Medium Deployment (100 nodes)
361/// ```rust,no_run
362/// use elara_runtime::health::HealthCheckConfig;
363/// use std::time::Duration;
364///
365/// let config = HealthCheckConfig {
366/// enabled: true,
367/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
368/// cache_ttl: Duration::from_secs(30),
369/// min_connections: Some(5),
370/// max_memory_mb: Some(2000),
371/// max_time_drift_ms: Some(100),
372/// max_pending_events: Some(1000),
373/// };
374/// ```
375///
376/// ## Large Deployment (1000 nodes)
377/// ```rust,no_run
378/// use elara_runtime::health::HealthCheckConfig;
379/// use std::time::Duration;
380///
381/// let config = HealthCheckConfig {
382/// enabled: true,
383/// server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
384/// cache_ttl: Duration::from_secs(30),
385/// min_connections: Some(10),
386/// max_memory_mb: Some(4000),
387/// max_time_drift_ms: Some(100),
388/// max_pending_events: Some(2000),
389/// };
390/// ```
#[derive(Debug, Clone)]
pub struct HealthCheckConfig {
    /// Master switch for the health check system.
    ///
    /// When `false`, no health checks are performed and no HTTP server is
    /// started, regardless of the other fields.
    ///
    /// Default: `true`
    pub enabled: bool,

    /// Optional bind address for the health check HTTP server.
    ///
    /// `Some(addr)` starts an HTTP server on that address exposing the
    /// `/health`, `/ready`, and `/live` endpoints. `None` keeps checks
    /// available programmatically without exposing any endpoints.
    ///
    /// Format: `"host:port"` (e.g., `"0.0.0.0:8080"`, `"127.0.0.1:8080"`)
    ///
    /// Default: `Some("0.0.0.0:8080")`
    pub server_bind_address: Option<std::net::SocketAddr>,

    /// Cache TTL for health check results.
    ///
    /// Results are cached for this duration so repeated probes within the
    /// TTL do not re-run every check.
    ///
    /// Recommended: 10-15 s (high-frequency probing), 30 s (normal),
    /// 60 s (low-frequency).
    ///
    /// Default: 30 seconds
    pub cache_ttl: Duration,

    /// Minimum number of active connections for `ConnectionHealthCheck`.
    ///
    /// `Some(n)` registers a check that reports `Degraded` when the active
    /// connection count falls below `n`. `None` disables the check.
    ///
    /// Recommended: 2-3 (small), 5-10 (medium), 10-20 (large deployments).
    ///
    /// Default: `Some(3)`
    pub min_connections: Option<usize>,

    /// Maximum process memory usage in megabytes for `MemoryHealthCheck`.
    ///
    /// `Some(n)` registers a check that reports `Unhealthy` when memory
    /// usage exceeds `n` MB. `None` disables the check.
    ///
    /// Set this to 80-90% of your container memory limit to allow for
    /// graceful degradation before OOM kills.
    ///
    /// Recommended: 1000 MB (small), 2000 MB (medium), 4000 MB (large
    /// deployments).
    ///
    /// Default: `Some(1800)` (1.8 GB)
    pub max_memory_mb: Option<usize>,

    /// Maximum time drift in milliseconds for `TimeDriftCheck`.
    ///
    /// `Some(n)` registers a check that reports `Degraded` when drift
    /// between local and network consensus time exceeds `n` ms. `None`
    /// disables the check. Excessive drift can cause synchronization
    /// issues and state divergence in distributed systems.
    ///
    /// Recommended value: 100 ms
    ///
    /// Default: `Some(100)`
    pub max_time_drift_ms: Option<i64>,

    /// Maximum pending events for `StateDivergenceCheck`.
    ///
    /// `Some(n)` registers a check that reports `Degraded` when the state
    /// reconciliation engine's pending-event backlog exceeds `n`. `None`
    /// disables the check. A large backlog may indicate network partitions
    /// or reconciliation issues.
    ///
    /// Recommended: 500 (small), 1000 (medium), 2000 (large deployments).
    ///
    /// Default: `Some(1000)`
    pub max_pending_events: Option<usize>,
}
498
499impl Default for HealthCheckConfig {
500 fn default() -> Self {
501 Self {
502 enabled: true,
503 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
504 cache_ttl: Duration::from_secs(30),
505 min_connections: Some(3),
506 max_memory_mb: Some(1800),
507 max_time_drift_ms: Some(100),
508 max_pending_events: Some(1000),
509 }
510 }
511}
512
513impl HealthCheckConfig {
514 /// Creates a new HealthCheckConfig with all checks disabled.
515 ///
516 /// This is useful when you want to selectively enable only specific checks.
517 ///
518 /// # Example
519 ///
520 /// ```rust
521 /// use elara_runtime::health::HealthCheckConfig;
522 ///
523 /// let mut config = HealthCheckConfig::disabled();
524 /// config.enabled = true;
525 /// config.max_memory_mb = Some(2000); // Only enable memory check
526 /// ```
527 pub fn disabled() -> Self {
528 Self {
529 enabled: false,
530 server_bind_address: None,
531 cache_ttl: Duration::from_secs(30),
532 min_connections: None,
533 max_memory_mb: None,
534 max_time_drift_ms: None,
535 max_pending_events: None,
536 }
537 }
538
539 /// Creates a configuration for small deployments (10 nodes).
540 ///
541 /// Recommended thresholds:
542 /// - Min connections: 2
543 /// - Max memory: 1000 MB
544 /// - Max time drift: 100 ms
545 /// - Max pending events: 500
546 pub fn small_deployment() -> Self {
547 Self {
548 enabled: true,
549 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
550 cache_ttl: Duration::from_secs(30),
551 min_connections: Some(2),
552 max_memory_mb: Some(1000),
553 max_time_drift_ms: Some(100),
554 max_pending_events: Some(500),
555 }
556 }
557
558 /// Creates a configuration for medium deployments (100 nodes).
559 ///
560 /// Recommended thresholds:
561 /// - Min connections: 5
562 /// - Max memory: 2000 MB
563 /// - Max time drift: 100 ms
564 /// - Max pending events: 1000
565 pub fn medium_deployment() -> Self {
566 Self {
567 enabled: true,
568 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
569 cache_ttl: Duration::from_secs(30),
570 min_connections: Some(5),
571 max_memory_mb: Some(2000),
572 max_time_drift_ms: Some(100),
573 max_pending_events: Some(1000),
574 }
575 }
576
577 /// Creates a configuration for large deployments (1000 nodes).
578 ///
579 /// Recommended thresholds:
580 /// - Min connections: 10
581 /// - Max memory: 4000 MB
582 /// - Max time drift: 100 ms
583 /// - Max pending events: 2000
584 pub fn large_deployment() -> Self {
585 Self {
586 enabled: true,
587 server_bind_address: Some("0.0.0.0:8080".parse().unwrap()),
588 cache_ttl: Duration::from_secs(30),
589 min_connections: Some(10),
590 max_memory_mb: Some(4000),
591 max_time_drift_ms: Some(100),
592 max_pending_events: Some(2000),
593 }
594 }
595
596 /// Validates the configuration.
597 ///
598 /// Returns `Ok(())` if the configuration is valid, or an error message
599 /// describing the validation failure.
600 ///
601 /// # Validation Rules
602 ///
603 /// - `cache_ttl` must be at least 1 second
604 /// - If `min_connections` is set, it must be > 0
605 /// - If `max_memory_mb` is set, it must be > 0
606 /// - If `max_time_drift_ms` is set, it must be > 0
607 /// - If `max_pending_events` is set, it must be > 0
608 pub fn validate(&self) -> Result<(), String> {
609 if self.cache_ttl < Duration::from_secs(1) {
610 return Err("cache_ttl must be at least 1 second".to_string());
611 }
612
613 if let Some(min_conn) = self.min_connections {
614 if min_conn == 0 {
615 return Err("min_connections must be greater than 0".to_string());
616 }
617 }
618
619 if let Some(max_mem) = self.max_memory_mb {
620 if max_mem == 0 {
621 return Err("max_memory_mb must be greater than 0".to_string());
622 }
623 }
624
625 if let Some(max_drift) = self.max_time_drift_ms {
626 if max_drift <= 0 {
627 return Err("max_time_drift_ms must be greater than 0".to_string());
628 }
629 }
630
631 if let Some(max_events) = self.max_pending_events {
632 if max_events == 0 {
633 return Err("max_pending_events must be greater than 0".to_string());
634 }
635 }
636
637 Ok(())
638 }
639}
640
641/// Health checker that orchestrates multiple health checks with caching.
642///
643/// The `HealthChecker` runs registered health checks and caches the results
644/// for a configurable TTL to avoid excessive checking overhead. This is
645/// particularly important when health checks are expensive (e.g., database
646/// queries, external service calls).
647///
648/// # Thread Safety
649///
650/// The `HealthChecker` is thread-safe and can be shared across threads using
651/// `Arc`. The internal cache is protected by a `RwLock` for concurrent access.
652///
653/// # Caching Behavior
654///
655/// - Health check results are cached for `cache_ttl` duration
656/// - Cached results are returned if still valid (not expired)
657/// - Expired results trigger a new health check execution
658/// - Cache updates are atomic and thread-safe
659///
660/// # Example
661///
662/// ```rust,no_run
663/// use elara_runtime::health::{HealthChecker, HealthCheck, HealthCheckResult};
664/// use std::time::Duration;
665/// use std::sync::Arc;
666///
667/// struct MyCheck;
668/// impl HealthCheck for MyCheck {
669/// fn name(&self) -> &str { "my_check" }
670/// fn check(&self) -> HealthCheckResult { HealthCheckResult::Healthy }
671/// }
672///
673/// let mut checker = HealthChecker::new(Duration::from_secs(30));
674/// checker.add_check(Box::new(MyCheck));
675///
676/// // First call executes checks
677/// let status1 = checker.check_health();
678///
679/// // Second call within TTL returns cached result
680/// let status2 = checker.check_health();
681/// ```
682pub struct HealthChecker {
683 /// Registered health checks
684 checks: Vec<Box<dyn HealthCheck>>,
685
686 /// Cached health status with timestamp
687 cache: Arc<RwLock<Option<HealthStatus>>>,
688
689 /// Cache time-to-live
690 cache_ttl: Duration,
691}
692
693impl HealthChecker {
694 /// Creates a new HealthChecker with the specified cache TTL.
695 ///
696 /// # Arguments
697 ///
698 /// * `cache_ttl` - Duration for which health check results are cached
699 ///
700 /// # Example
701 ///
702 /// ```rust
703 /// use elara_runtime::health::HealthChecker;
704 /// use std::time::Duration;
705 ///
706 /// let checker = HealthChecker::new(Duration::from_secs(30));
707 /// ```
708 pub fn new(cache_ttl: Duration) -> Self {
709 Self {
710 checks: Vec::new(),
711 cache: Arc::new(RwLock::new(None)),
712 cache_ttl,
713 }
714 }
715
716 /// Creates a new HealthChecker with default configuration.
717 ///
718 /// Uses a default cache TTL of 30 seconds.
719 pub fn with_default_config() -> Self {
720 Self::new(HealthCheckerConfig::default().cache_ttl)
721 }
722
723 /// Creates a new HealthChecker with the specified configuration.
724 pub fn with_config(config: HealthCheckerConfig) -> Self {
725 Self::new(config.cache_ttl)
726 }
727
728 /// Adds a health check to the checker.
729 ///
730 /// Health checks are executed in the order they are added.
731 ///
732 /// # Arguments
733 ///
734 /// * `check` - Boxed health check implementation
735 ///
736 /// # Example
737 ///
738 /// ```rust,no_run
739 /// use elara_runtime::health::{HealthChecker, HealthCheck, HealthCheckResult};
740 /// use std::time::Duration;
741 ///
742 /// struct MyCheck;
743 /// impl HealthCheck for MyCheck {
744 /// fn name(&self) -> &str { "my_check" }
745 /// fn check(&self) -> HealthCheckResult { HealthCheckResult::Healthy }
746 /// }
747 ///
748 /// let mut checker = HealthChecker::new(Duration::from_secs(30));
749 /// checker.add_check(Box::new(MyCheck));
750 /// ```
751 pub fn add_check(&mut self, check: Box<dyn HealthCheck>) {
752 self.checks.push(check);
753 }
754
755 /// Checks the health of all registered checks.
756 ///
757 /// This method returns cached results if they are still valid (within TTL).
758 /// If the cache is expired or empty, it executes all health checks and
759 /// updates the cache.
760 ///
761 /// # Returns
762 ///
763 /// `HealthStatus` containing the overall status and individual check results.
764 ///
765 /// # Performance
766 ///
767 /// - Cached reads: O(1) with read lock
768 /// - Cache miss: O(n) where n is the number of checks, with write lock
769 ///
770 /// # Example
771 ///
772 /// ```rust,no_run
773 /// use elara_runtime::health::HealthChecker;
774 /// use std::time::Duration;
775 ///
776 /// let checker = HealthChecker::new(Duration::from_secs(30));
777 /// let status = checker.check_health();
778 ///
779 /// if status.is_healthy() {
780 /// println!("All systems operational");
781 /// } else {
782 /// println!("System degraded or unhealthy");
783 /// }
784 /// ```
785 pub fn check_health(&self) -> HealthStatus {
786 // Fast path: check if cache is valid
787 {
788 let cache = self.cache.read();
789 if let Some(ref status) = *cache {
790 if status.timestamp.elapsed() < self.cache_ttl {
791 return status.clone();
792 }
793 }
794 }
795
796 // Slow path: execute health checks and update cache
797 let mut results = HashMap::new();
798
799 for check in &self.checks {
800 let result = check.check();
801 results.insert(check.name().to_string(), result);
802 }
803
804 let status = HealthStatus::new(results);
805
806 // Update cache
807 {
808 let mut cache = self.cache.write();
809 *cache = Some(status.clone());
810 }
811
812 status
813 }
814
815 /// Clears the cached health status, forcing the next check to execute.
816 ///
817 /// This is useful for testing or when you need to force a fresh health check.
818 pub fn clear_cache(&self) {
819 let mut cache = self.cache.write();
820 *cache = None;
821 }
822
823 /// Returns the number of registered health checks.
824 pub fn check_count(&self) -> usize {
825 self.checks.len()
826 }
827
828 /// Returns the cache TTL duration.
829 pub fn cache_ttl(&self) -> Duration {
830 self.cache_ttl
831 }
832}
833
#[cfg(test)]
mod tests {
    use super::*;

    /// Test double whose name and result are fixed at construction time.
    struct FixedCheck {
        label: &'static str,
        outcome: HealthCheckResult,
    }

    impl FixedCheck {
        /// A check that always reports `Healthy`.
        fn healthy() -> Box<dyn HealthCheck> {
            Box::new(FixedCheck {
                label: "always_healthy",
                outcome: HealthCheckResult::Healthy,
            })
        }

        /// A check that always reports `Degraded`.
        fn degraded() -> Box<dyn HealthCheck> {
            Box::new(FixedCheck {
                label: "always_degraded",
                outcome: HealthCheckResult::Degraded {
                    reason: "Test degradation".to_string(),
                },
            })
        }

        /// A check that always reports `Unhealthy`.
        fn unhealthy() -> Box<dyn HealthCheck> {
            Box::new(FixedCheck {
                label: "always_unhealthy",
                outcome: HealthCheckResult::Unhealthy {
                    reason: "Test failure".to_string(),
                },
            })
        }
    }

    impl HealthCheck for FixedCheck {
        fn name(&self) -> &str {
            self.label
        }

        fn check(&self) -> HealthCheckResult {
            self.outcome.clone()
        }
    }

    #[test]
    fn test_health_check_result_methods() {
        // Healthy carries no reason.
        let healthy = HealthCheckResult::Healthy;
        assert!(healthy.is_healthy());
        assert!(!healthy.is_degraded());
        assert!(!healthy.is_unhealthy());
        assert_eq!(healthy.reason(), None);

        // Degraded exposes its reason.
        let degraded = HealthCheckResult::Degraded {
            reason: "test".to_string(),
        };
        assert!(!degraded.is_healthy());
        assert!(degraded.is_degraded());
        assert!(!degraded.is_unhealthy());
        assert_eq!(degraded.reason(), Some("test"));

        // Unhealthy exposes its reason.
        let unhealthy = HealthCheckResult::Unhealthy {
            reason: "test".to_string(),
        };
        assert!(!unhealthy.is_healthy());
        assert!(!unhealthy.is_degraded());
        assert!(unhealthy.is_unhealthy());
        assert_eq!(unhealthy.reason(), Some("test"));
    }

    #[test]
    fn test_health_checker_empty() {
        // Zero registered checks is vacuously healthy.
        let checker = HealthChecker::new(Duration::from_secs(30));
        let status = checker.check_health();
        assert!(status.is_healthy());
        assert_eq!(status.checks.len(), 0);
    }

    #[test]
    fn test_health_checker_all_healthy() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());

        let status = checker.check_health();
        assert!(status.is_healthy());
        assert_eq!(status.checks.len(), 1);
    }

    #[test]
    fn test_health_checker_degraded() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());
        checker.add_check(FixedCheck::degraded());

        let status = checker.check_health();
        assert!(status.is_degraded());
        assert_eq!(status.checks.len(), 2);
    }

    #[test]
    fn test_health_checker_unhealthy() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());
        checker.add_check(FixedCheck::degraded());
        checker.add_check(FixedCheck::unhealthy());

        let status = checker.check_health();
        assert!(status.is_unhealthy());
        assert_eq!(status.checks.len(), 3);
    }

    #[test]
    fn test_health_checker_precedence() {
        // Unhealthy outranks Degraded.
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::degraded());
        checker.add_check(FixedCheck::unhealthy());
        assert!(checker.check_health().is_unhealthy());

        // Degraded outranks Healthy.
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());
        checker.add_check(FixedCheck::degraded());
        assert!(checker.check_health().is_degraded());
    }

    #[test]
    fn test_health_checker_caching() {
        let mut checker = HealthChecker::new(Duration::from_millis(100));
        checker.add_check(FixedCheck::healthy());

        // Within the TTL the cached snapshot (same timestamp) is reused.
        let first = checker.check_health().timestamp;
        let second = checker.check_health().timestamp;
        assert_eq!(first, second);

        // Once the TTL has elapsed, the checks run again and the
        // timestamp advances.
        std::thread::sleep(Duration::from_millis(150));
        let third = checker.check_health().timestamp;
        assert!(third > first);
    }

    #[test]
    fn test_health_checker_clear_cache() {
        let mut checker = HealthChecker::new(Duration::from_secs(30));
        checker.add_check(FixedCheck::healthy());

        let before = checker.check_health().timestamp;
        checker.clear_cache();
        // With the cache cleared, the next call must recompute.
        let after = checker.check_health().timestamp;
        assert!(after > before);
    }

    #[test]
    fn test_health_status_aggregation() {
        let mut checks = HashMap::new();

        // All healthy => overall healthy.
        checks.insert("check1".to_string(), HealthCheckResult::Healthy);
        checks.insert("check2".to_string(), HealthCheckResult::Healthy);
        assert!(HealthStatus::new(checks.clone()).is_healthy());

        // A single degraded check degrades the aggregate.
        checks.insert(
            "check3".to_string(),
            HealthCheckResult::Degraded {
                reason: "test".to_string(),
            },
        );
        assert!(HealthStatus::new(checks.clone()).is_degraded());

        // A single unhealthy check dominates everything else.
        checks.insert(
            "check4".to_string(),
            HealthCheckResult::Unhealthy {
                reason: "test".to_string(),
            },
        );
        assert!(HealthStatus::new(checks).is_unhealthy());
    }
}
1023
1024// ============================================================================
1025// Built-in Health Checks
1026// ============================================================================
1027
/// Health check for monitoring active connection count.
///
/// Verifies that the node holds at least `min_connections` active network
/// connections. Too few connections may indicate network issues,
/// configuration problems, or that the node is isolated from the network.
///
/// # Status Determination
///
/// - `Healthy`: Active connections >= min_connections
/// - `Degraded`: Active connections < min_connections
///
/// This check never reports `Unhealthy`: a low connection count is treated
/// as a degradation rather than a hard failure.
///
/// # Example
///
/// ```rust,no_run
/// use elara_runtime::health::{ConnectionHealthCheck, HealthCheck};
/// use elara_runtime::node::Node;
/// use std::sync::Arc;
///
/// let node = Arc::new(Node::new());
/// let check = ConnectionHealthCheck::new(node, 3);
/// let result = check.check();
/// ```
pub struct ConnectionHealthCheck {
    /// Node whose connections are monitored. Currently unused (hence the
    /// leading underscore): `check()` relies on a placeholder count until
    /// `Node` exposes an active-connection query.
    _node: Arc<crate::node::Node>,
    /// Minimum number of connections required for healthy status
    min_connections: usize,
}
1056
1057impl ConnectionHealthCheck {
1058 /// Creates a new ConnectionHealthCheck.
1059 ///
1060 /// # Arguments
1061 ///
1062 /// * `node` - Arc reference to the Node to monitor
1063 /// * `min_connections` - Minimum number of active connections for healthy status
1064 ///
1065 /// # Example
1066 ///
1067 /// ```rust,no_run
1068 /// use elara_runtime::health::ConnectionHealthCheck;
1069 /// use elara_runtime::node::Node;
1070 /// use std::sync::Arc;
1071 ///
1072 /// let node = Arc::new(Node::new());
1073 /// let check = ConnectionHealthCheck::new(node, 3);
1074 /// ```
1075 pub fn new(node: Arc<crate::node::Node>, min_connections: usize) -> Self {
1076 Self {
1077 _node: node,
1078 min_connections,
1079 }
1080 }
1081
1082 /// Returns the configured minimum connections threshold.
1083 pub fn min_connections(&self) -> usize {
1084 self.min_connections
1085 }
1086}
1087
1088impl HealthCheck for ConnectionHealthCheck {
1089 fn name(&self) -> &str {
1090 "connections"
1091 }
1092
1093 fn check(&self) -> HealthCheckResult {
1094 // For now, we'll use a placeholder since Node doesn't have active_connections() yet
1095 // In a real implementation, this would query the actual connection count
1096 // from the transport layer or session manager
1097 let active = 0; // TODO: Implement node.active_connections()
1098
1099 if active >= self.min_connections {
1100 HealthCheckResult::Healthy
1101 } else {
1102 HealthCheckResult::Degraded {
1103 reason: format!(
1104 "Only {} active connections (minimum: {})",
1105 active, self.min_connections
1106 ),
1107 }
1108 }
1109 }
1110}
1111
/// Health check for monitoring memory usage.
///
/// Compares the current process memory usage (sampled via the `sysinfo`
/// crate) against a configured maximum. Excessive memory usage can lead to
/// OOM kills, performance degradation, and system instability.
///
/// # Status Determination
///
/// - `Healthy`: Memory usage < max_memory_mb
/// - `Unhealthy`: Memory usage >= max_memory_mb
///
/// Note that, unlike most checks in this module, crossing the threshold
/// reports `Unhealthy` rather than `Degraded`.
///
/// # Example
///
/// ```rust
/// use elara_runtime::health::{MemoryHealthCheck, HealthCheck};
///
/// let check = MemoryHealthCheck::new(1800); // 1800 MB limit
/// let result = check.check();
/// ```
pub struct MemoryHealthCheck {
    /// Maximum memory usage in megabytes before unhealthy
    max_memory_mb: usize,
    /// Shared `sysinfo::System` handle, reused across checks so each call
    /// only refreshes (rather than rebuilds) the system snapshot.
    system: Arc<RwLock<sysinfo::System>>,
}
1139
1140impl MemoryHealthCheck {
1141 /// Creates a new MemoryHealthCheck.
1142 ///
1143 /// # Arguments
1144 ///
1145 /// * `max_memory_mb` - Maximum memory usage in MB before marking unhealthy
1146 ///
1147 /// # Example
1148 ///
1149 /// ```rust
1150 /// use elara_runtime::health::MemoryHealthCheck;
1151 ///
1152 /// let check = MemoryHealthCheck::new(2048); // 2GB limit
1153 /// ```
1154 pub fn new(max_memory_mb: usize) -> Self {
1155 Self {
1156 max_memory_mb,
1157 system: Arc::new(RwLock::new(sysinfo::System::new_all())),
1158 }
1159 }
1160
1161 /// Returns the configured maximum memory threshold in MB.
1162 pub fn max_memory_mb(&self) -> usize {
1163 self.max_memory_mb
1164 }
1165
1166 /// Gets the current memory usage in megabytes.
1167 ///
1168 /// This method refreshes the system memory information and returns
1169 /// the current process memory usage.
1170 fn get_memory_usage_mb(&self) -> usize {
1171 let mut system = self.system.write();
1172 system.refresh_memory();
1173 system.refresh_processes();
1174
1175 // Get current process PID
1176 let pid = sysinfo::get_current_pid().ok();
1177
1178 if let Some(pid) = pid {
1179 if let Some(process) = system.process(pid) {
1180 // Convert bytes to megabytes
1181 return (process.memory() / 1_048_576) as usize;
1182 }
1183 }
1184
1185 // Fallback: return 0 if we can't get process info
1186 0
1187 }
1188}
1189
1190impl HealthCheck for MemoryHealthCheck {
1191 fn name(&self) -> &str {
1192 "memory"
1193 }
1194
1195 fn check(&self) -> HealthCheckResult {
1196 let usage_mb = self.get_memory_usage_mb();
1197
1198 if usage_mb < self.max_memory_mb {
1199 HealthCheckResult::Healthy
1200 } else {
1201 HealthCheckResult::Unhealthy {
1202 reason: format!(
1203 "Memory usage {}MB exceeds limit {}MB",
1204 usage_mb, self.max_memory_mb
1205 ),
1206 }
1207 }
1208 }
1209}
1210
/// Health check for monitoring time drift.
///
/// Monitors the drift between the local node clock and the network
/// consensus time, as reported by the node's time engine. Excessive drift
/// can cause synchronization issues, event ordering problems, and state
/// divergence.
///
/// # Status Determination
///
/// - `Healthy`: |time_drift| < max_drift_ms
/// - `Degraded`: |time_drift| >= max_drift_ms
///
/// # Example
///
/// ```rust,no_run
/// use elara_runtime::health::{TimeDriftCheck, HealthCheck};
/// use elara_runtime::node::Node;
/// use std::sync::Arc;
///
/// let node = Arc::new(Node::new());
/// let check = TimeDriftCheck::new(node, 100); // 100ms max drift
/// let result = check.check();
/// ```
pub struct TimeDriftCheck {
    /// Node whose time engine supplies the drift measurement
    node: Arc<crate::node::Node>,
    /// Maximum acceptable absolute time drift in milliseconds
    max_drift_ms: i64,
}
1239
1240impl TimeDriftCheck {
1241 /// Creates a new TimeDriftCheck.
1242 ///
1243 /// # Arguments
1244 ///
1245 /// * `node` - Arc reference to the Node to monitor
1246 /// * `max_drift_ms` - Maximum acceptable time drift in milliseconds
1247 ///
1248 /// # Example
1249 ///
1250 /// ```rust,no_run
1251 /// use elara_runtime::health::TimeDriftCheck;
1252 /// use elara_runtime::node::Node;
1253 /// use std::sync::Arc;
1254 ///
1255 /// let node = Arc::new(Node::new());
1256 /// let check = TimeDriftCheck::new(node, 100);
1257 /// ```
1258 pub fn new(node: Arc<crate::node::Node>, max_drift_ms: i64) -> Self {
1259 Self {
1260 node,
1261 max_drift_ms,
1262 }
1263 }
1264
1265 /// Returns the configured maximum drift threshold in milliseconds.
1266 pub fn max_drift_ms(&self) -> i64 {
1267 self.max_drift_ms
1268 }
1269
1270 /// Gets the current time drift in milliseconds.
1271 ///
1272 /// This queries the time engine to determine the drift between
1273 /// local time and network consensus time.
1274 fn get_time_drift_ms(&self) -> i64 {
1275 // Access the time engine to get drift information
1276 let time_engine = self.node.time_engine();
1277
1278 // Get the current drift from the time engine
1279 // The drift is the difference between local time and network time
1280 time_engine.drift_ms()
1281 }
1282}
1283
1284impl HealthCheck for TimeDriftCheck {
1285 fn name(&self) -> &str {
1286 "time_drift"
1287 }
1288
1289 fn check(&self) -> HealthCheckResult {
1290 let drift_ms = self.get_time_drift_ms();
1291 let abs_drift = drift_ms.abs();
1292
1293 if abs_drift < self.max_drift_ms {
1294 HealthCheckResult::Healthy
1295 } else {
1296 HealthCheckResult::Degraded {
1297 reason: format!(
1298 "Time drift {}ms exceeds limit {}ms",
1299 drift_ms, self.max_drift_ms
1300 ),
1301 }
1302 }
1303 }
1304}
1305
/// Health check for monitoring state convergence.
///
/// Monitors the state engine's backlog of pending (not yet reconciled)
/// events. A growing backlog can indicate network partitions, bugs in the
/// reconciliation logic, or other serious issues.
///
/// # Status Determination
///
/// - `Healthy`: Pending events < max_pending_events
/// - `Degraded`: Pending events >= max_pending_events
///
/// NOTE(review): earlier docs also listed an `Unhealthy` state for outright
/// divergence, but the current implementation only distinguishes
/// Healthy/Degraded based on the pending-event count.
///
/// # Example
///
/// ```rust,no_run
/// use elara_runtime::health::{StateDivergenceCheck, HealthCheck};
/// use elara_runtime::node::Node;
/// use std::sync::Arc;
///
/// let node = Arc::new(Node::new());
/// let check = StateDivergenceCheck::new(node);
/// let result = check.check();
/// ```
pub struct StateDivergenceCheck {
    /// Node whose state engine is queried for the pending-event count
    node: Arc<crate::node::Node>,
    /// Maximum acceptable pending events before degraded
    max_pending_events: usize,
}
1336
1337impl StateDivergenceCheck {
1338 /// Creates a new StateDivergenceCheck.
1339 ///
1340 /// # Arguments
1341 ///
1342 /// * `node` - Arc reference to the Node to monitor
1343 ///
1344 /// # Example
1345 ///
1346 /// ```rust,no_run
1347 /// use elara_runtime::health::StateDivergenceCheck;
1348 /// use elara_runtime::node::Node;
1349 /// use std::sync::Arc;
1350 ///
1351 /// let node = Arc::new(Node::new());
1352 /// let check = StateDivergenceCheck::new(node);
1353 /// ```
1354 pub fn new(node: Arc<crate::node::Node>) -> Self {
1355 Self::with_threshold(node, 1000)
1356 }
1357
1358 /// Creates a new StateDivergenceCheck with a custom threshold.
1359 ///
1360 /// # Arguments
1361 ///
1362 /// * `node` - Arc reference to the Node to monitor
1363 /// * `max_pending_events` - Maximum pending events before degraded status
1364 ///
1365 /// # Example
1366 ///
1367 /// ```rust,no_run
1368 /// use elara_runtime::health::StateDivergenceCheck;
1369 /// use elara_runtime::node::Node;
1370 /// use std::sync::Arc;
1371 ///
1372 /// let node = Arc::new(Node::new());
1373 /// let check = StateDivergenceCheck::with_threshold(node, 500);
1374 /// ```
1375 pub fn with_threshold(node: Arc<crate::node::Node>, max_pending_events: usize) -> Self {
1376 Self {
1377 node,
1378 max_pending_events,
1379 }
1380 }
1381
1382 /// Returns the configured maximum pending events threshold.
1383 pub fn max_pending_events(&self) -> usize {
1384 self.max_pending_events
1385 }
1386
1387 /// Checks the state convergence status.
1388 ///
1389 /// This examines the reconciliation engine to determine if state
1390 /// is converging properly.
1391 fn check_convergence(&self) -> (bool, usize) {
1392 // Access the state engine to check convergence
1393 let state_engine = self.node.state_engine();
1394
1395 // Get the number of pending events that haven't been reconciled
1396 // In a real implementation, this would query the reconciliation engine
1397 // for metrics about pending events, unmerged states, etc.
1398 let pending_events = state_engine.pending_count();
1399
1400 // Check if we're converging (pending count is reasonable)
1401 let is_converging = pending_events < self.max_pending_events;
1402
1403 (is_converging, pending_events)
1404 }
1405}
1406
1407impl HealthCheck for StateDivergenceCheck {
1408 fn name(&self) -> &str {
1409 "state_convergence"
1410 }
1411
1412 fn check(&self) -> HealthCheckResult {
1413 let (is_converging, pending_events) = self.check_convergence();
1414
1415 if is_converging {
1416 HealthCheckResult::Healthy
1417 } else {
1418 HealthCheckResult::Degraded {
1419 reason: format!(
1420 "State convergence slow: {} pending events (threshold: {})",
1421 pending_events, self.max_pending_events
1422 ),
1423 }
1424 }
1425 }
1426}
1427
#[cfg(test)]
mod builtin_tests {
    use super::*;
    use crate::node::Node;

    #[test]
    fn test_memory_health_check() {
        // A huge limit should never be exceeded by the test process.
        let generous = MemoryHealthCheck::new(100_000); // 100GB
        assert!(generous.check().is_healthy(), "Should be healthy with high threshold");

        // A tiny limit is guaranteed to be exceeded.
        let strict = MemoryHealthCheck::new(1); // 1MB
        assert!(strict.check().is_unhealthy(), "Should be unhealthy with low threshold");
    }

    #[test]
    fn test_memory_health_check_threshold() {
        assert_eq!(MemoryHealthCheck::new(1800).max_memory_mb(), 1800);
    }

    #[test]
    fn test_connection_health_check_creation() {
        let check = ConnectionHealthCheck::new(Arc::new(Node::new()), 3);
        assert_eq!(check.name(), "connections");
        assert_eq!(check.min_connections(), 3);
    }

    #[test]
    fn test_time_drift_check_creation() {
        let check = TimeDriftCheck::new(Arc::new(Node::new()), 100);
        assert_eq!(check.name(), "time_drift");
        assert_eq!(check.max_drift_ms(), 100);
    }

    #[test]
    fn test_state_divergence_check_creation() {
        let node = Arc::new(Node::new());

        // `new` applies the default threshold of 1000 pending events.
        let default_check = StateDivergenceCheck::new(node.clone());
        assert_eq!(default_check.name(), "state_convergence");
        assert_eq!(default_check.max_pending_events(), 1000);

        // `with_threshold` honors a caller-supplied limit.
        let custom_check = StateDivergenceCheck::with_threshold(node, 500);
        assert_eq!(custom_check.max_pending_events(), 500);
    }

    #[test]
    fn test_all_builtin_checks_with_health_checker() {
        let node = Arc::new(Node::new());
        let mut checker = HealthChecker::new(Duration::from_secs(30));

        // Register every built-in check.
        checker.add_check(Box::new(ConnectionHealthCheck::new(node.clone(), 3)));
        checker.add_check(Box::new(MemoryHealthCheck::new(100_000)));
        checker.add_check(Box::new(TimeDriftCheck::new(node.clone(), 100)));
        checker.add_check(Box::new(StateDivergenceCheck::new(node)));
        assert_eq!(checker.check_count(), 4);

        // Run the checks and confirm each one reported under its own name.
        let status = checker.check_health();
        assert_eq!(status.checks.len(), 4);
        for name in &["connections", "memory", "time_drift", "state_convergence"] {
            assert!(status.checks.contains_key(*name));
        }
    }

    #[test]
    fn test_health_check_config_default() {
        let cfg = HealthCheckConfig::default();

        assert!(cfg.enabled);
        assert!(cfg.server_bind_address.is_some());
        assert_eq!(cfg.cache_ttl, Duration::from_secs(30));
        assert_eq!(cfg.min_connections, Some(3));
        assert_eq!(cfg.max_memory_mb, Some(1800));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(1000));

        // The defaults must pass their own validation.
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_disabled() {
        let cfg = HealthCheckConfig::disabled();

        assert!(!cfg.enabled);
        assert!(cfg.server_bind_address.is_none());
        assert!(cfg.min_connections.is_none());
        assert!(cfg.max_memory_mb.is_none());
        assert!(cfg.max_time_drift_ms.is_none());
        assert!(cfg.max_pending_events.is_none());

        // A disabled config is still structurally valid.
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_small_deployment() {
        let cfg = HealthCheckConfig::small_deployment();

        assert!(cfg.enabled);
        assert_eq!(cfg.min_connections, Some(2));
        assert_eq!(cfg.max_memory_mb, Some(1000));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(500));
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_medium_deployment() {
        let cfg = HealthCheckConfig::medium_deployment();

        assert!(cfg.enabled);
        assert_eq!(cfg.min_connections, Some(5));
        assert_eq!(cfg.max_memory_mb, Some(2000));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(1000));
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_large_deployment() {
        let cfg = HealthCheckConfig::large_deployment();

        assert!(cfg.enabled);
        assert_eq!(cfg.min_connections, Some(10));
        assert_eq!(cfg.max_memory_mb, Some(4000));
        assert_eq!(cfg.max_time_drift_ms, Some(100));
        assert_eq!(cfg.max_pending_events, Some(2000));
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_cache_ttl() {
        let mut cfg = HealthCheckConfig::default();

        // One second is the smallest accepted TTL.
        cfg.cache_ttl = Duration::from_secs(1);
        assert!(cfg.validate().is_ok());

        // Anything below one second is rejected with a specific message.
        cfg.cache_ttl = Duration::from_millis(999);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "cache_ttl must be at least 1 second"
        );
    }

    #[test]
    fn test_health_check_config_validation_min_connections() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.min_connections = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.min_connections = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "min_connections must be greater than 0"
        );

        // `None` disables the check and is always valid.
        cfg.min_connections = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_max_memory() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.max_memory_mb = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.max_memory_mb = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "max_memory_mb must be greater than 0"
        );

        // `None` disables the check and is always valid.
        cfg.max_memory_mb = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_max_time_drift() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.max_time_drift_ms = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.max_time_drift_ms = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "max_time_drift_ms must be greater than 0"
        );

        // Negative values are rejected too.
        cfg.max_time_drift_ms = Some(-1);
        assert!(cfg.validate().is_err());

        // `None` disables the check and is always valid.
        cfg.max_time_drift_ms = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_validation_max_pending_events() {
        let mut cfg = HealthCheckConfig::default();

        // Any positive value is accepted.
        cfg.max_pending_events = Some(1);
        assert!(cfg.validate().is_ok());

        // Zero is rejected with a specific message.
        cfg.max_pending_events = Some(0);
        assert!(cfg.validate().is_err());
        assert_eq!(
            cfg.validate().unwrap_err(),
            "max_pending_events must be greater than 0"
        );

        // `None` disables the check and is always valid.
        cfg.max_pending_events = None;
        assert!(cfg.validate().is_ok());
    }

    #[test]
    fn test_health_check_config_selective_checks() {
        // Individual checks can be disabled by leaving their threshold unset.
        let cfg = HealthCheckConfig {
            enabled: true,
            server_bind_address: None,
            cache_ttl: Duration::from_secs(30),
            min_connections: Some(5),
            max_memory_mb: None, // Disabled
            max_time_drift_ms: Some(100),
            max_pending_events: None, // Disabled
        };

        assert!(cfg.validate().is_ok());
        assert!(cfg.enabled);
        assert!(cfg.min_connections.is_some());
        assert!(cfg.max_memory_mb.is_none());
        assert!(cfg.max_time_drift_ms.is_some());
        assert!(cfg.max_pending_events.is_none());
    }
}