ipfrs_network/
health.rs

1//! Network health check endpoints
2//!
3//! Provides comprehensive health status reporting including:
4//! - Overall network health
5//! - Component health (DHT, connections, etc.)
6//! - Historical health tracking
7//! - Health check HTTP endpoints (optional)
8
9use crate::{DhtHealth, DhtHealthStatus, NetworkMetrics};
10use serde::Serialize;
11use std::time::Instant;
12
/// Overall network health status.
///
/// Used for the aggregate report as well as for each individual component.
/// The score-based checks in this file map a score to a status with the
/// thresholds noted on each variant.
#[derive(Debug, Clone, Serialize, PartialEq)]
pub enum NetworkHealthStatus {
    /// All systems operational (score-based checks: score of 0.8 or above)
    Healthy,
    /// Some degradation but operational (score-based checks: 0.5 up to, but not including, 0.8)
    Degraded,
    /// Critical issues affecting operation (score-based checks: below 0.5)
    Unhealthy,
    /// Not enough data to determine health; the overall report uses this when
    /// its averaged score is not greater than 0.0
    Unknown,
}
25
/// Component health status.
///
/// One entry of this type is produced per checked subsystem and collected in
/// `NetworkHealth::components`.
#[derive(Debug, Clone, Serialize)]
pub struct ComponentHealth {
    /// Component name; the checks in this file emit "connections", "dht" and "bandwidth"
    pub name: String,
    /// Component status
    pub status: NetworkHealthStatus,
    /// Optional human-readable explanation, set when the check has something to report
    pub message: Option<String>,
    /// Health score (0.0 - 1.0).
    /// NOTE(review): the DHT component forwards `DhtHealth::health_score`
    /// unchanged, so this range relies on that value being normalised — confirm.
    pub score: f64,
}
38
/// Complete network health report.
///
/// Returned by `HealthChecker::check_health` and cached as the checker's
/// "last" result.
#[derive(Debug, Clone, Serialize)]
pub struct NetworkHealth {
    /// Overall status
    pub status: NetworkHealthStatus,
    /// Overall health score (0.0 - 1.0): the arithmetic mean of the component scores
    pub score: f64,
    /// Component health details, in the order the components were checked
    pub components: Vec<ComponentHealth>,
    /// Time of health check, in whole seconds since the Unix epoch
    pub timestamp: u64,
    /// Uptime in seconds, as reported by the network metrics at check time
    pub uptime_secs: u64,
}
53
54impl NetworkHealth {
55    /// Check if the network is healthy
56    pub fn is_healthy(&self) -> bool {
57        self.status == NetworkHealthStatus::Healthy
58    }
59
60    /// Check if the network is degraded
61    pub fn is_degraded(&self) -> bool {
62        self.status == NetworkHealthStatus::Degraded
63    }
64
65    /// Check if the network is unhealthy
66    pub fn is_unhealthy(&self) -> bool {
67        self.status == NetworkHealthStatus::Unhealthy
68    }
69}
70
/// Health checker for network components.
///
/// Caches the most recent report and keeps a bounded log of past overall
/// statuses. Both live behind `parking_lot::RwLock`s, so checks are recorded
/// through a shared `&self`.
pub struct HealthChecker {
    /// Last health check result (`None` until the first check has run)
    last_check: parking_lot::RwLock<Option<NetworkHealth>>,
    /// Health check history, oldest entry first; each entry pairs the
    /// `Instant` at which the check was recorded with its overall status
    history: parking_lot::RwLock<Vec<(Instant, NetworkHealthStatus)>>,
    /// Maximum history size (`new()` sets this to 100)
    max_history: usize,
}
80
81impl HealthChecker {
82    /// Create a new health checker
83    pub fn new() -> Self {
84        Self {
85            last_check: parking_lot::RwLock::new(None),
86            history: parking_lot::RwLock::new(Vec::new()),
87            max_history: 100,
88        }
89    }
90
91    /// Perform a health check
92    pub fn check_health(
93        &self,
94        metrics: &NetworkMetrics,
95        dht_health: Option<&DhtHealth>,
96    ) -> NetworkHealth {
97        let mut components = Vec::new();
98        let mut total_score = 0.0;
99        let mut component_count = 0;
100
101        // Check connection health
102        let connection_health = self.check_connection_health(metrics);
103        total_score += connection_health.score;
104        component_count += 1;
105        components.push(connection_health);
106
107        // Check DHT health if available
108        if let Some(dht) = dht_health {
109            let dht_component = self.check_dht_health(dht);
110            total_score += dht_component.score;
111            component_count += 1;
112            components.push(dht_component);
113        }
114
115        // Check bandwidth health
116        let bandwidth_health = self.check_bandwidth_health(metrics);
117        total_score += bandwidth_health.score;
118        component_count += 1;
119        components.push(bandwidth_health);
120
121        // Calculate overall score and status
122        let overall_score = if component_count > 0 {
123            total_score / component_count as f64
124        } else {
125            0.0
126        };
127
128        let overall_status = if overall_score >= 0.8 {
129            NetworkHealthStatus::Healthy
130        } else if overall_score >= 0.5 {
131            NetworkHealthStatus::Degraded
132        } else if overall_score > 0.0 {
133            NetworkHealthStatus::Unhealthy
134        } else {
135            NetworkHealthStatus::Unknown
136        };
137
138        let health = NetworkHealth {
139            status: overall_status.clone(),
140            score: overall_score,
141            components,
142            timestamp: std::time::SystemTime::now()
143                .duration_since(std::time::UNIX_EPOCH)
144                .unwrap()
145                .as_secs(),
146            uptime_secs: metrics.uptime().as_secs(),
147        };
148
149        // Store in history
150        let mut history = self.history.write();
151        history.push((Instant::now(), overall_status));
152        if history.len() > self.max_history {
153            history.remove(0);
154        }
155
156        // Store last check
157        *self.last_check.write() = Some(health.clone());
158
159        health
160    }
161
162    /// Get the last health check result
163    pub fn last_health(&self) -> Option<NetworkHealth> {
164        self.last_check.read().clone()
165    }
166
167    /// Get health history summary
168    pub fn health_history(&self) -> HealthHistory {
169        let history = self.history.read();
170        let total = history.len();
171
172        if total == 0 {
173            return HealthHistory {
174                total_checks: 0,
175                healthy_count: 0,
176                degraded_count: 0,
177                unhealthy_count: 0,
178                unknown_count: 0,
179                healthy_percentage: 0.0,
180            };
181        }
182
183        let mut healthy_count = 0;
184        let mut degraded_count = 0;
185        let mut unhealthy_count = 0;
186        let mut unknown_count = 0;
187
188        for (_, status) in history.iter() {
189            match status {
190                NetworkHealthStatus::Healthy => healthy_count += 1,
191                NetworkHealthStatus::Degraded => degraded_count += 1,
192                NetworkHealthStatus::Unhealthy => unhealthy_count += 1,
193                NetworkHealthStatus::Unknown => unknown_count += 1,
194            }
195        }
196
197        HealthHistory {
198            total_checks: total,
199            healthy_count,
200            degraded_count,
201            unhealthy_count,
202            unknown_count,
203            healthy_percentage: (healthy_count as f64 / total as f64) * 100.0,
204        }
205    }
206
207    /// Check connection health
208    fn check_connection_health(&self, metrics: &NetworkMetrics) -> ComponentHealth {
209        let snapshot = metrics.connections().snapshot();
210        let total = snapshot.total_established;
211        let failed = snapshot.total_failed;
212        let active = snapshot.active;
213
214        let success_rate = if total > 0 {
215            (total - failed) as f64 / total as f64
216        } else {
217            1.0 // No connections yet, assume healthy
218        };
219
220        let has_connections = active > 0;
221
222        let score = if !has_connections && total == 0 {
223            0.5 // Starting up, no connections yet
224        } else if !has_connections {
225            0.3 // Had connections but lost them all
226        } else {
227            success_rate
228        };
229
230        let status = if score >= 0.8 {
231            NetworkHealthStatus::Healthy
232        } else if score >= 0.5 {
233            NetworkHealthStatus::Degraded
234        } else {
235            NetworkHealthStatus::Unhealthy
236        };
237
238        let message = if !has_connections && total > 0 {
239            Some("No active connections".to_string())
240        } else if success_rate < 0.5 {
241            Some(format!(
242                "High connection failure rate: {:.1}%",
243                (1.0 - success_rate) * 100.0
244            ))
245        } else {
246            None
247        };
248
249        ComponentHealth {
250            name: "connections".to_string(),
251            status,
252            message,
253            score,
254        }
255    }
256
257    /// Check DHT health
258    fn check_dht_health(&self, dht_health: &DhtHealth) -> ComponentHealth {
259        let status = match dht_health.status {
260            DhtHealthStatus::Healthy => NetworkHealthStatus::Healthy,
261            DhtHealthStatus::Degraded => NetworkHealthStatus::Degraded,
262            DhtHealthStatus::Unhealthy => NetworkHealthStatus::Unhealthy,
263            DhtHealthStatus::Unknown => NetworkHealthStatus::Unknown,
264        };
265
266        let message = if dht_health.peer_count == 0 {
267            Some("No peers in routing table".to_string())
268        } else if dht_health.query_success_rate < 0.5 {
269            Some(format!(
270                "Low query success rate: {:.1}%",
271                dht_health.query_success_rate * 100.0
272            ))
273        } else {
274            None
275        };
276
277        ComponentHealth {
278            name: "dht".to_string(),
279            status,
280            message,
281            score: dht_health.health_score,
282        }
283    }
284
285    /// Check bandwidth health
286    fn check_bandwidth_health(&self, metrics: &NetworkMetrics) -> ComponentHealth {
287        let snapshot = metrics.bandwidth().snapshot();
288        let total_traffic = snapshot.total_sent + snapshot.total_received;
289
290        // Simple heuristic: if we have active connections but no traffic, something might be wrong
291        let connections = metrics.connections().active();
292
293        let score = if connections == 0 {
294            0.8 // No connections, can't judge bandwidth
295        } else if total_traffic == 0 {
296            0.5 // Have connections but no traffic (might be normal for new connections)
297        } else {
298            1.0 // Have traffic, all good
299        };
300
301        let status = if score >= 0.8 {
302            NetworkHealthStatus::Healthy
303        } else if score >= 0.5 {
304            NetworkHealthStatus::Degraded
305        } else {
306            NetworkHealthStatus::Unhealthy
307        };
308
309        ComponentHealth {
310            name: "bandwidth".to_string(),
311            status,
312            message: None,
313            score,
314        }
315    }
316}
317
318impl Default for HealthChecker {
319    fn default() -> Self {
320        Self::new()
321    }
322}
323
/// Health history summary.
///
/// Produced by `HealthChecker::health_history` from the retained history
/// entries, so it covers only the most recent checks, not every check ever run.
#[derive(Debug, Clone, Serialize)]
pub struct HealthHistory {
    /// Number of health checks currently retained in the history (the
    /// history is capped, so this is not a lifetime total)
    pub total_checks: usize,
    /// Number of healthy checks
    pub healthy_count: usize,
    /// Number of degraded checks
    pub degraded_count: usize,
    /// Number of unhealthy checks
    pub unhealthy_count: usize,
    /// Number of unknown checks
    pub unknown_count: usize,
    /// Percentage of healthy checks on a 0-100 scale (0.0 when there are no checks)
    pub healthy_percentage: f64,
}
340
#[cfg(test)]
mod tests {
    use super::*;
    use crate::metrics::NetworkMetrics;

    /// A freshly built checker has no cached report.
    #[test]
    fn test_health_checker_creation() {
        let fresh = HealthChecker::new();
        assert!(fresh.last_health().is_none());
    }

    /// With untouched metrics the report must be degraded or unknown.
    #[test]
    fn test_health_check_no_connections() {
        let checker = HealthChecker::new();
        let idle_metrics = NetworkMetrics::new();

        let report = checker.check_health(&idle_metrics, None);

        assert!(matches!(
            report.status,
            NetworkHealthStatus::Degraded | NetworkHealthStatus::Unknown
        ));
    }

    /// Without DHT input the report lists exactly two components, with the
    /// connection component first.
    #[test]
    fn test_health_check_with_connections() {
        let checker = HealthChecker::new();
        let metrics = NetworkMetrics::new();

        // Record two established connections, one per flag value.
        let connections = metrics.connections();
        connections.connection_established(true);
        connections.connection_established(false);

        let report = checker.check_health(&metrics, None);

        // connections + bandwidth
        assert_eq!(report.components.len(), 2);
        assert_eq!(report.components[0].name, "connections");
    }

    /// Every check performed is counted in the history summary.
    #[test]
    fn test_health_history() {
        let checker = HealthChecker::new();
        let metrics = NetworkMetrics::new();

        (0..5).for_each(|_| {
            checker.check_health(&metrics, None);
        });

        let summary = checker.health_history();
        assert_eq!(summary.total_checks, 5);
    }

    /// An established connection plus traffic in both directions scores
    /// above 0.5.
    #[test]
    fn test_health_status_determination() {
        let checker = HealthChecker::new();
        let metrics = NetworkMetrics::new();

        metrics.connections().connection_established(true);
        let bandwidth = metrics.bandwidth();
        bandwidth.record_sent(1000);
        bandwidth.record_received(2000);

        let report = checker.check_health(&metrics, None);

        assert!(report.score > 0.5);
    }

    /// The cached report matches the one that was just returned.
    #[test]
    fn test_last_health_stored() {
        let checker = HealthChecker::new();
        let metrics = NetworkMetrics::new();

        let returned = checker.check_health(&metrics, None);
        let cached = checker.last_health().unwrap();

        assert_eq!(returned.timestamp, cached.timestamp);
        assert_eq!(returned.score, cached.score);
    }
}